diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,18200 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.999297541394882, + "eval_steps": 400, + "global_step": 5604, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.002676032781401572, + "grad_norm": 5.007836359233225, + "learning_rate": 8.9126559714795e-09, + "logits/chosen": -0.0686589926481247, + "logits/rejected": 0.14136984944343567, + "logps/chosen": -1.7160040140151978, + "logps/rejected": -1.889505386352539, + "loss": 0.7102, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.7160040140151978, + "rewards/margins": 0.1735011637210846, + "rewards/rejected": -1.889505386352539, + "sft_loss": 1.468440294265747, + "step": 5 + }, + { + "epoch": 0.005352065562803144, + "grad_norm": 9.54345972209578, + "learning_rate": 1.7825311942959e-08, + "logits/chosen": -0.007096876855939627, + "logits/rejected": 0.11429889500141144, + "logps/chosen": -1.802136778831482, + "logps/rejected": -1.8458713293075562, + "loss": 0.7922, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.802136778831482, + "rewards/margins": 0.043734706938266754, + "rewards/rejected": -1.8458713293075562, + "sft_loss": 1.5083144903182983, + "step": 10 + }, + { + "epoch": 0.008028098344204716, + "grad_norm": 10.76009186748774, + "learning_rate": 2.67379679144385e-08, + "logits/chosen": -0.03930598497390747, + "logits/rejected": 0.06061038374900818, + "logps/chosen": -1.635589838027954, + "logps/rejected": -1.7648627758026123, + "loss": 0.7673, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -1.635589838027954, + "rewards/margins": 0.12927308678627014, + "rewards/rejected": -1.7648627758026123, + "sft_loss": 1.5007826089859009, + "step": 15 + }, + { + "epoch": 0.010704131125606288, + "grad_norm": 5.0343169890815735, + "learning_rate": 3.5650623885918e-08, + "logits/chosen": -0.039493732154369354, + "logits/rejected": 0.04849349707365036, + "logps/chosen": -1.7248096466064453, + "logps/rejected": -1.8056780099868774, + "loss": 0.791, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -1.7248096466064453, + "rewards/margins": 0.08086834847927094, + "rewards/rejected": -1.8056780099868774, + "sft_loss": 1.5005706548690796, + "step": 20 + }, + { + "epoch": 0.013380163907007862, + "grad_norm": 16.01029536696111, + "learning_rate": 4.45632798573975e-08, + "logits/chosen": -0.06505529582500458, + "logits/rejected": 0.020799441263079643, + "logps/chosen": -1.8701883554458618, + "logps/rejected": -1.7795698642730713, + "loss": 0.8987, + "rewards/accuracies": 0.3812499940395355, + "rewards/chosen": -1.8701883554458618, + "rewards/margins": -0.09061814099550247, + "rewards/rejected": -1.7795698642730713, + "sft_loss": 1.5459671020507812, + "step": 25 + }, + { + "epoch": 0.016056196688409432, + "grad_norm": 8.419472984626248, + "learning_rate": 5.3475935828877e-08, + "logits/chosen": -0.09517794847488403, + "logits/rejected": 8.840263035381213e-05, + "logps/chosen": -1.9080820083618164, + "logps/rejected": -1.8322795629501343, + "loss": 0.8508, + "rewards/accuracies": 0.4375, + "rewards/chosen": -1.9080820083618164, + "rewards/margins": -0.07580234855413437, + "rewards/rejected": -1.8322795629501343, + "sft_loss": 1.6464264392852783, + "step": 30 + }, + { + "epoch": 0.018732229469811006, + "grad_norm": 9.603595781210746, + "learning_rate": 6.23885918003565e-08, + "logits/chosen": -0.047932375222444534, + "logits/rejected": 0.11526918411254883, + "logps/chosen": -1.8458175659179688, + "logps/rejected": -1.9971166849136353, + "loss": 0.8152, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": -1.8458175659179688, + "rewards/margins": 0.15129896998405457, + "rewards/rejected": -1.9971166849136353, + "sft_loss": 1.5614925622940063, + "step": 35 + }, + { + "epoch": 0.021408262251212576, + "grad_norm": 9.043739776427191, + "learning_rate": 7.1301247771836e-08, + "logits/chosen": 0.0321493037045002, + "logits/rejected": 0.21030040085315704, + "logps/chosen": -1.8800443410873413, + "logps/rejected": -1.742314100265503, + "loss": 0.8692, + "rewards/accuracies": 0.45625001192092896, + "rewards/chosen": -1.8800443410873413, + "rewards/margins": -0.13773025572299957, + "rewards/rejected": -1.742314100265503, + "sft_loss": 1.5186289548873901, + "step": 40 + }, + { + "epoch": 0.02408429503261415, + "grad_norm": 14.676100184258393, + "learning_rate": 8.021390374331551e-08, + "logits/chosen": 0.027946826070547104, + "logits/rejected": 0.23048114776611328, + "logps/chosen": -1.8351805210113525, + "logps/rejected": -1.8704410791397095, + "loss": 0.8339, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -1.8351805210113525, + "rewards/margins": 0.03526050224900246, + "rewards/rejected": -1.8704410791397095, + "sft_loss": 1.5355684757232666, + "step": 45 + }, + { + "epoch": 0.026760327814015723, + "grad_norm": 11.460982576779914, + "learning_rate": 8.9126559714795e-08, + "logits/chosen": -0.04780071973800659, + "logits/rejected": 0.10815383493900299, + "logps/chosen": -1.8967111110687256, + "logps/rejected": -1.7773425579071045, + "loss": 0.8844, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -1.8967111110687256, + "rewards/margins": -0.11936845630407333, + "rewards/rejected": -1.7773425579071045, + "sft_loss": 1.5827276706695557, + "step": 50 + }, + { + "epoch": 0.029436360595417294, + "grad_norm": 7.361779322900751, + "learning_rate": 9.80392156862745e-08, + "logits/chosen": -0.12230806052684784, + "logits/rejected": 0.0970315933227539, + "logps/chosen": -1.830540657043457, + "logps/rejected": -1.8639686107635498, + "loss": 0.8545, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.830540657043457, + "rewards/margins": 0.03342791646718979, + "rewards/rejected": -1.8639686107635498, + "sft_loss": 1.5817229747772217, + "step": 55 + }, + { + "epoch": 0.032112393376818864, + "grad_norm": 7.008292330029091, + "learning_rate": 1.06951871657754e-07, + "logits/chosen": -0.09996206313371658, + "logits/rejected": 0.09421779960393906, + "logps/chosen": -1.7850735187530518, + "logps/rejected": -1.890228271484375, + "loss": 0.7729, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.7850735187530518, + "rewards/margins": 0.10515467822551727, + "rewards/rejected": -1.890228271484375, + "sft_loss": 1.5428695678710938, + "step": 60 + }, + { + "epoch": 0.03478842615822044, + "grad_norm": 5.8428555921928265, + "learning_rate": 1.158645276292335e-07, + "logits/chosen": -0.01847922056913376, + "logits/rejected": 0.13318422436714172, + "logps/chosen": -1.6318439245224, + "logps/rejected": -1.7609220743179321, + "loss": 0.7337, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.6318439245224, + "rewards/margins": 0.12907817959785461, + "rewards/rejected": -1.7609220743179321, + "sft_loss": 1.4719539880752563, + "step": 65 + }, + { + "epoch": 0.03746445893962201, + "grad_norm": 11.535699964755835, + "learning_rate": 1.24777183600713e-07, + "logits/chosen": -0.06903735548257828, + "logits/rejected": 0.08338301628828049, + "logps/chosen": -1.7622463703155518, + "logps/rejected": -1.8080501556396484, + "loss": 0.8392, + "rewards/accuracies": 0.42500001192092896, + "rewards/chosen": -1.7622463703155518, + "rewards/margins": 0.045803725719451904, + "rewards/rejected": -1.8080501556396484, + "sft_loss": 1.6276578903198242, + "step": 70 + }, + { + "epoch": 0.04014049172102358, + "grad_norm": 12.274354805500392, + "learning_rate": 1.3368983957219251e-07, + "logits/chosen": -0.049991387873888016, + "logits/rejected": 0.13196972012519836, + "logps/chosen": -1.7706562280654907, + "logps/rejected": -2.0294816493988037, + "loss": 0.7273, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.7706562280654907, + "rewards/margins": 0.25882548093795776, + "rewards/rejected": -2.0294816493988037, + "sft_loss": 1.5631906986236572, + "step": 75 + }, + { + "epoch": 0.04281652450242515, + "grad_norm": 8.065691812498399, + "learning_rate": 1.42602495543672e-07, + "logits/chosen": 0.007000925950706005, + "logits/rejected": 0.11386320739984512, + "logps/chosen": -1.7066723108291626, + "logps/rejected": -1.739593505859375, + "loss": 0.7943, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.7066723108291626, + "rewards/margins": 0.03292134404182434, + "rewards/rejected": -1.739593505859375, + "sft_loss": 1.5200769901275635, + "step": 80 + }, + { + "epoch": 0.04549255728382673, + "grad_norm": 5.087382142819339, + "learning_rate": 1.5151515151515152e-07, + "logits/chosen": -0.13984394073486328, + "logits/rejected": 0.11329865455627441, + "logps/chosen": -1.770267128944397, + "logps/rejected": -1.9428346157073975, + "loss": 0.7577, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -1.770267128944397, + "rewards/margins": 0.17256739735603333, + "rewards/rejected": -1.9428346157073975, + "sft_loss": 1.488335371017456, + "step": 85 + }, + { + "epoch": 0.0481685900652283, + "grad_norm": 14.454887817717022, + "learning_rate": 1.6042780748663102e-07, + "logits/chosen": 0.10436830669641495, + "logits/rejected": 0.06718367338180542, + "logps/chosen": -1.7230262756347656, + "logps/rejected": -1.757142424583435, + "loss": 0.8211, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": -1.7230262756347656, + "rewards/margins": 0.03411626070737839, + "rewards/rejected": -1.757142424583435, + "sft_loss": 1.4506069421768188, + "step": 90 + }, + { + "epoch": 0.05084462284662987, + "grad_norm": 5.421984139338058, + "learning_rate": 1.693404634581105e-07, + "logits/chosen": -0.06644740700721741, + "logits/rejected": 0.08747534453868866, + "logps/chosen": -1.7685035467147827, + "logps/rejected": -1.884555459022522, + "loss": 0.7823, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.7685035467147827, + "rewards/margins": 0.11605201661586761, + "rewards/rejected": -1.884555459022522, + "sft_loss": 1.5129237174987793, + "step": 95 + }, + { + "epoch": 0.05352065562803145, + "grad_norm": 4.635883271568906, + "learning_rate": 1.7825311942959e-07, + "logits/chosen": -0.027575846761465073, + "logits/rejected": 0.040067609399557114, + "logps/chosen": -1.6706348657608032, + "logps/rejected": -1.7762954235076904, + "loss": 0.7543, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.6706348657608032, + "rewards/margins": 0.1056608110666275, + "rewards/rejected": -1.7762954235076904, + "sft_loss": 1.4829761981964111, + "step": 100 + }, + { + "epoch": 0.05619668840943302, + "grad_norm": 9.791136385045762, + "learning_rate": 1.8716577540106952e-07, + "logits/chosen": 0.06704328954219818, + "logits/rejected": 0.09551501274108887, + "logps/chosen": -1.6210968494415283, + "logps/rejected": -1.790696144104004, + "loss": 0.7318, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.6210968494415283, + "rewards/margins": 0.16959939897060394, + "rewards/rejected": -1.790696144104004, + "sft_loss": 1.4277700185775757, + "step": 105 + }, + { + "epoch": 0.05887272119083459, + "grad_norm": 6.014061678484121, + "learning_rate": 1.96078431372549e-07, + "logits/chosen": 0.0035847374238073826, + "logits/rejected": 0.10112349689006805, + "logps/chosen": -1.6423265933990479, + "logps/rejected": -1.6988475322723389, + "loss": 0.7825, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -1.6423265933990479, + "rewards/margins": 0.05652119964361191, + "rewards/rejected": -1.6988475322723389, + "sft_loss": 1.4510281085968018, + "step": 110 + }, + { + "epoch": 0.06154875397223616, + "grad_norm": 9.636781087531212, + "learning_rate": 2.049910873440285e-07, + "logits/chosen": 0.0216152872890234, + "logits/rejected": 0.23170170187950134, + "logps/chosen": -1.6151697635650635, + "logps/rejected": -1.8834221363067627, + "loss": 0.6893, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.6151697635650635, + "rewards/margins": 0.2682521939277649, + "rewards/rejected": -1.8834221363067627, + "sft_loss": 1.5399529933929443, + "step": 115 + }, + { + "epoch": 0.06422478675363773, + "grad_norm": 5.837092063890294, + "learning_rate": 2.13903743315508e-07, + "logits/chosen": -0.09482669830322266, + "logits/rejected": 0.07769424468278885, + "logps/chosen": -1.670412302017212, + "logps/rejected": -1.7851431369781494, + "loss": 0.7476, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -1.670412302017212, + "rewards/margins": 0.11473057419061661, + "rewards/rejected": -1.7851431369781494, + "sft_loss": 1.52475905418396, + "step": 120 + }, + { + "epoch": 0.0669008195350393, + "grad_norm": 4.61651137551121, + "learning_rate": 2.2281639928698751e-07, + "logits/chosen": -0.0943833515048027, + "logits/rejected": 0.035781342536211014, + "logps/chosen": -1.601912498474121, + "logps/rejected": -1.5632137060165405, + "loss": 0.7998, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -1.601912498474121, + "rewards/margins": -0.03869876265525818, + "rewards/rejected": -1.5632137060165405, + "sft_loss": 1.4977911710739136, + "step": 125 + }, + { + "epoch": 0.06957685231644088, + "grad_norm": 8.07442245018302, + "learning_rate": 2.31729055258467e-07, + "logits/chosen": 0.04960859939455986, + "logits/rejected": 0.18758010864257812, + "logps/chosen": -1.6405330896377563, + "logps/rejected": -1.759902000427246, + "loss": 0.7124, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -1.6405330896377563, + "rewards/margins": 0.11936911195516586, + "rewards/rejected": -1.759902000427246, + "sft_loss": 1.5563743114471436, + "step": 130 + }, + { + "epoch": 0.07225288509784245, + "grad_norm": 15.363477991757385, + "learning_rate": 2.406417112299465e-07, + "logits/chosen": -0.0477161630988121, + "logits/rejected": 0.07339635491371155, + "logps/chosen": -1.6893672943115234, + "logps/rejected": -1.7175499200820923, + "loss": 0.7915, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.6893672943115234, + "rewards/margins": 0.028182348236441612, + "rewards/rejected": -1.7175499200820923, + "sft_loss": 1.4941551685333252, + "step": 135 + }, + { + "epoch": 0.07492891787924402, + "grad_norm": 8.438465773901987, + "learning_rate": 2.49554367201426e-07, + "logits/chosen": -0.04275672882795334, + "logits/rejected": 0.12644967436790466, + "logps/chosen": -1.6506645679473877, + "logps/rejected": -1.7775901556015015, + "loss": 0.7327, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -1.6506645679473877, + "rewards/margins": 0.12692561745643616, + "rewards/rejected": -1.7775901556015015, + "sft_loss": 1.5400911569595337, + "step": 140 + }, + { + "epoch": 0.0776049506606456, + "grad_norm": 9.072723379650128, + "learning_rate": 2.5846702317290554e-07, + "logits/chosen": -0.02842085435986519, + "logits/rejected": 0.12878260016441345, + "logps/chosen": -1.56234610080719, + "logps/rejected": -1.6720802783966064, + "loss": 0.7311, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -1.56234610080719, + "rewards/margins": 0.10973384231328964, + "rewards/rejected": -1.6720802783966064, + "sft_loss": 1.4896290302276611, + "step": 145 + }, + { + "epoch": 0.08028098344204716, + "grad_norm": 11.917688310921138, + "learning_rate": 2.6737967914438503e-07, + "logits/chosen": -0.08208204805850983, + "logits/rejected": 0.0795825868844986, + "logps/chosen": -1.5122708082199097, + "logps/rejected": -1.5111819505691528, + "loss": 0.7804, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.5122708082199097, + "rewards/margins": -0.0010888517135754228, + "rewards/rejected": -1.5111819505691528, + "sft_loss": 1.3492343425750732, + "step": 150 + }, + { + "epoch": 0.08295701622344874, + "grad_norm": 7.993571495447725, + "learning_rate": 2.762923351158645e-07, + "logits/chosen": -0.04924124851822853, + "logits/rejected": 0.004975716583430767, + "logps/chosen": -1.5232518911361694, + "logps/rejected": -1.6216939687728882, + "loss": 0.7286, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.5232518911361694, + "rewards/margins": 0.098441943526268, + "rewards/rejected": -1.6216939687728882, + "sft_loss": 1.4322659969329834, + "step": 155 + }, + { + "epoch": 0.0856330490048503, + "grad_norm": 7.499988200221608, + "learning_rate": 2.85204991087344e-07, + "logits/chosen": -0.1464027464389801, + "logits/rejected": -0.003987524192780256, + "logps/chosen": -1.6318111419677734, + "logps/rejected": -1.6092545986175537, + "loss": 0.805, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": -1.6318111419677734, + "rewards/margins": -0.02255646511912346, + "rewards/rejected": -1.6092545986175537, + "sft_loss": 1.4808762073516846, + "step": 160 + }, + { + "epoch": 0.08830908178625188, + "grad_norm": 7.056161316365935, + "learning_rate": 2.941176470588235e-07, + "logits/chosen": -0.07448373734951019, + "logits/rejected": 0.09501216560602188, + "logps/chosen": -1.4764728546142578, + "logps/rejected": -1.5967134237289429, + "loss": 0.7312, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.4764728546142578, + "rewards/margins": 0.1202404722571373, + "rewards/rejected": -1.5967134237289429, + "sft_loss": 1.372351050376892, + "step": 165 + }, + { + "epoch": 0.09098511456765346, + "grad_norm": 13.80003838568036, + "learning_rate": 3.0303030303030305e-07, + "logits/chosen": -0.09532450139522552, + "logits/rejected": -0.03916095569729805, + "logps/chosen": -1.5926252603530884, + "logps/rejected": -1.6508491039276123, + "loss": 0.7664, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -1.5926252603530884, + "rewards/margins": 0.0582241527736187, + "rewards/rejected": -1.6508491039276123, + "sft_loss": 1.4843275547027588, + "step": 170 + }, + { + "epoch": 0.09366114734905502, + "grad_norm": 7.915014598111359, + "learning_rate": 3.1194295900178254e-07, + "logits/chosen": 0.04395443946123123, + "logits/rejected": 0.04227130487561226, + "logps/chosen": -1.4485992193222046, + "logps/rejected": -1.549605369567871, + "loss": 0.73, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.4485992193222046, + "rewards/margins": 0.1010061502456665, + "rewards/rejected": -1.549605369567871, + "sft_loss": 1.4186640977859497, + "step": 175 + }, + { + "epoch": 0.0963371801304566, + "grad_norm": 7.065980369491947, + "learning_rate": 3.2085561497326203e-07, + "logits/chosen": -0.0706276148557663, + "logits/rejected": -0.07220318913459778, + "logps/chosen": -1.4419519901275635, + "logps/rejected": -1.6293582916259766, + "loss": 0.7267, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.4419519901275635, + "rewards/margins": 0.18740621209144592, + "rewards/rejected": -1.6293582916259766, + "sft_loss": 1.414329171180725, + "step": 180 + }, + { + "epoch": 0.09901321291185818, + "grad_norm": 6.786407778120364, + "learning_rate": 3.297682709447415e-07, + "logits/chosen": -0.16844771802425385, + "logits/rejected": -0.08266721665859222, + "logps/chosen": -1.3996317386627197, + "logps/rejected": -1.4599860906600952, + "loss": 0.7491, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": -1.3996317386627197, + "rewards/margins": 0.0603543221950531, + "rewards/rejected": -1.4599860906600952, + "sft_loss": 1.3800976276397705, + "step": 185 + }, + { + "epoch": 0.10168924569325974, + "grad_norm": 7.300767239806578, + "learning_rate": 3.38680926916221e-07, + "logits/chosen": -0.0973626971244812, + "logits/rejected": 0.022847438231110573, + "logps/chosen": -1.3400040864944458, + "logps/rejected": -1.474424123764038, + "loss": 0.6999, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.3400040864944458, + "rewards/margins": 0.13442011177539825, + "rewards/rejected": -1.474424123764038, + "sft_loss": 1.327915906906128, + "step": 190 + }, + { + "epoch": 0.10436527847466132, + "grad_norm": 4.64361809307048, + "learning_rate": 3.475935828877005e-07, + "logits/chosen": -0.007407195866107941, + "logits/rejected": 0.14789652824401855, + "logps/chosen": -1.2831056118011475, + "logps/rejected": -1.4523200988769531, + "loss": 0.6801, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.2831056118011475, + "rewards/margins": 0.16921459138393402, + "rewards/rejected": -1.4523200988769531, + "sft_loss": 1.3099342584609985, + "step": 195 + }, + { + "epoch": 0.1070413112560629, + "grad_norm": 14.575211650225327, + "learning_rate": 3.5650623885918e-07, + "logits/chosen": -0.11465966701507568, + "logits/rejected": 0.021672243252396584, + "logps/chosen": -1.4081348180770874, + "logps/rejected": -1.4467805624008179, + "loss": 0.7459, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.4081348180770874, + "rewards/margins": 0.03864575922489166, + "rewards/rejected": -1.4467805624008179, + "sft_loss": 1.4106323719024658, + "step": 200 + }, + { + "epoch": 0.10971734403746446, + "grad_norm": 10.129209630022192, + "learning_rate": 3.654188948306595e-07, + "logits/chosen": -0.10025770962238312, + "logits/rejected": 0.03735864534974098, + "logps/chosen": -1.3229440450668335, + "logps/rejected": -1.3909223079681396, + "loss": 0.7343, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.3229440450668335, + "rewards/margins": 0.06797824800014496, + "rewards/rejected": -1.3909223079681396, + "sft_loss": 1.301814079284668, + "step": 205 + }, + { + "epoch": 0.11239337681886603, + "grad_norm": 7.458541707417236, + "learning_rate": 3.7433155080213904e-07, + "logits/chosen": -0.18131954967975616, + "logits/rejected": 0.0006055116537027061, + "logps/chosen": -1.3974190950393677, + "logps/rejected": -1.5311411619186401, + "loss": 0.7184, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.3974190950393677, + "rewards/margins": 0.13372211158275604, + "rewards/rejected": -1.5311411619186401, + "sft_loss": 1.3603262901306152, + "step": 210 + }, + { + "epoch": 0.1150694096002676, + "grad_norm": 5.689970505577374, + "learning_rate": 3.8324420677361853e-07, + "logits/chosen": -0.21051593124866486, + "logits/rejected": 0.03054944798350334, + "logps/chosen": -1.4112319946289062, + "logps/rejected": -1.481041431427002, + "loss": 0.7194, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.4112319946289062, + "rewards/margins": 0.06980942189693451, + "rewards/rejected": -1.481041431427002, + "sft_loss": 1.383886456489563, + "step": 215 + }, + { + "epoch": 0.11774544238166917, + "grad_norm": 12.530800803177595, + "learning_rate": 3.92156862745098e-07, + "logits/chosen": 0.034305017441511154, + "logits/rejected": 0.13380172848701477, + "logps/chosen": -1.3511821031570435, + "logps/rejected": -1.5188138484954834, + "loss": 0.6987, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.3511821031570435, + "rewards/margins": 0.16763189435005188, + "rewards/rejected": -1.5188138484954834, + "sft_loss": 1.360771656036377, + "step": 220 + }, + { + "epoch": 0.12042147516307075, + "grad_norm": 4.653708323105172, + "learning_rate": 4.010695187165775e-07, + "logits/chosen": -0.12555362284183502, + "logits/rejected": 0.04124899208545685, + "logps/chosen": -1.353245496749878, + "logps/rejected": -1.4943913221359253, + "loss": 0.6904, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.353245496749878, + "rewards/margins": 0.14114579558372498, + "rewards/rejected": -1.4943913221359253, + "sft_loss": 1.3382834196090698, + "step": 225 + }, + { + "epoch": 0.12309750794447231, + "grad_norm": 5.33403533842893, + "learning_rate": 4.09982174688057e-07, + "logits/chosen": -0.031155142933130264, + "logits/rejected": 0.045900508761405945, + "logps/chosen": -1.3820817470550537, + "logps/rejected": -1.5517305135726929, + "loss": 0.6978, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.3820817470550537, + "rewards/margins": 0.16964863240718842, + "rewards/rejected": -1.5517305135726929, + "sft_loss": 1.3105311393737793, + "step": 230 + }, + { + "epoch": 0.1257735407258739, + "grad_norm": 8.706238550801956, + "learning_rate": 4.188948306595365e-07, + "logits/chosen": -0.032305438071489334, + "logits/rejected": 0.09844879806041718, + "logps/chosen": -1.3472206592559814, + "logps/rejected": -1.5170398950576782, + "loss": 0.6835, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.3472206592559814, + "rewards/margins": 0.16981934010982513, + "rewards/rejected": -1.5170398950576782, + "sft_loss": 1.3197492361068726, + "step": 235 + }, + { + "epoch": 0.12844957350727546, + "grad_norm": 4.084985722679438, + "learning_rate": 4.27807486631016e-07, + "logits/chosen": -0.05923212692141533, + "logits/rejected": 0.06422128528356552, + "logps/chosen": -1.3529850244522095, + "logps/rejected": -1.5478001832962036, + "loss": 0.6863, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.3529850244522095, + "rewards/margins": 0.19481512904167175, + "rewards/rejected": -1.5478001832962036, + "sft_loss": 1.3814536333084106, + "step": 240 + }, + { + "epoch": 0.13112560628867703, + "grad_norm": 6.9924765863104374, + "learning_rate": 4.3672014260249554e-07, + "logits/chosen": 0.0005951419589109719, + "logits/rejected": 0.11696485430002213, + "logps/chosen": -1.4723063707351685, + "logps/rejected": -1.5162469148635864, + "loss": 0.748, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.4723063707351685, + "rewards/margins": 0.0439404733479023, + "rewards/rejected": -1.5162469148635864, + "sft_loss": 1.464321255683899, + "step": 245 + }, + { + "epoch": 0.1338016390700786, + "grad_norm": 11.619283656380444, + "learning_rate": 4.4563279857397503e-07, + "logits/chosen": -0.10031737387180328, + "logits/rejected": 0.05885583162307739, + "logps/chosen": -1.3780508041381836, + "logps/rejected": -1.4445879459381104, + "loss": 0.7493, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -1.3780508041381836, + "rewards/margins": 0.06653715670108795, + "rewards/rejected": -1.4445879459381104, + "sft_loss": 1.3511133193969727, + "step": 250 + }, + { + "epoch": 0.1364776718514802, + "grad_norm": 8.252245686424416, + "learning_rate": 4.545454545454545e-07, + "logits/chosen": -0.04596395045518875, + "logits/rejected": 0.09472165256738663, + "logps/chosen": -1.327968955039978, + "logps/rejected": -1.4561989307403564, + "loss": 0.6962, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.327968955039978, + "rewards/margins": 0.12822984158992767, + "rewards/rejected": -1.4561989307403564, + "sft_loss": 1.278329849243164, + "step": 255 + }, + { + "epoch": 0.13915370463288176, + "grad_norm": 5.269952180322018, + "learning_rate": 4.63458110516934e-07, + "logits/chosen": -0.2541922330856323, + "logits/rejected": -0.1493658423423767, + "logps/chosen": -1.4302836656570435, + "logps/rejected": -1.5907987356185913, + "loss": 0.6847, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.4302836656570435, + "rewards/margins": 0.1605151742696762, + "rewards/rejected": -1.5907987356185913, + "sft_loss": 1.414287805557251, + "step": 260 + }, + { + "epoch": 0.1418297374142833, + "grad_norm": 7.611663736654725, + "learning_rate": 4.723707664884135e-07, + "logits/chosen": -0.09324956685304642, + "logits/rejected": -0.008047522976994514, + "logps/chosen": -1.42179536819458, + "logps/rejected": -1.5962066650390625, + "loss": 0.699, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.42179536819458, + "rewards/margins": 0.1744113266468048, + "rewards/rejected": -1.5962066650390625, + "sft_loss": 1.4485552310943604, + "step": 265 + }, + { + "epoch": 0.1445057701956849, + "grad_norm": 4.296214658770507, + "learning_rate": 4.81283422459893e-07, + "logits/chosen": -0.09900476038455963, + "logits/rejected": 0.034304648637771606, + "logps/chosen": -1.3736134767532349, + "logps/rejected": -1.4784634113311768, + "loss": 0.704, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.3736134767532349, + "rewards/margins": 0.10485007613897324, + "rewards/rejected": -1.4784634113311768, + "sft_loss": 1.3692827224731445, + "step": 270 + }, + { + "epoch": 0.14718180297708647, + "grad_norm": 5.364644455566007, + "learning_rate": 4.901960784313725e-07, + "logits/chosen": -0.04653949290513992, + "logits/rejected": 0.05055801197886467, + "logps/chosen": -1.3204883337020874, + "logps/rejected": -1.5095690488815308, + "loss": 0.6815, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.3204883337020874, + "rewards/margins": 0.18908073008060455, + "rewards/rejected": -1.5095690488815308, + "sft_loss": 1.2910387516021729, + "step": 275 + }, + { + "epoch": 0.14985783575848804, + "grad_norm": 6.003631973478166, + "learning_rate": 4.99108734402852e-07, + "logits/chosen": -0.10684315115213394, + "logits/rejected": 0.05572965741157532, + "logps/chosen": -1.391960859298706, + "logps/rejected": -1.5029346942901611, + "loss": 0.7129, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.391960859298706, + "rewards/margins": 0.11097397655248642, + "rewards/rejected": -1.5029346942901611, + "sft_loss": 1.3595014810562134, + "step": 280 + }, + { + "epoch": 0.15253386853988962, + "grad_norm": 6.480892543259098, + "learning_rate": 5.080213903743315e-07, + "logits/chosen": -0.07482358068227768, + "logits/rejected": 0.06803113967180252, + "logps/chosen": -1.3924249410629272, + "logps/rejected": -1.4932241439819336, + "loss": 0.727, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.3924249410629272, + "rewards/margins": 0.1007990688085556, + "rewards/rejected": -1.4932241439819336, + "sft_loss": 1.4270622730255127, + "step": 285 + }, + { + "epoch": 0.1552099013212912, + "grad_norm": 6.616276424059221, + "learning_rate": 5.169340463458111e-07, + "logits/chosen": -0.12157426029443741, + "logits/rejected": 0.18175940215587616, + "logps/chosen": -1.414035439491272, + "logps/rejected": -1.5550696849822998, + "loss": 0.6967, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.414035439491272, + "rewards/margins": 0.14103442430496216, + "rewards/rejected": -1.5550696849822998, + "sft_loss": 1.3971197605133057, + "step": 290 + }, + { + "epoch": 0.15788593410269275, + "grad_norm": 7.138403141788474, + "learning_rate": 5.258467023172905e-07, + "logits/chosen": -0.05374305695295334, + "logits/rejected": 0.003037288784980774, + "logps/chosen": -1.3270564079284668, + "logps/rejected": -1.4653503894805908, + "loss": 0.691, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.3270564079284668, + "rewards/margins": 0.13829405605793, + "rewards/rejected": -1.4653503894805908, + "sft_loss": 1.3143774271011353, + "step": 295 + }, + { + "epoch": 0.16056196688409433, + "grad_norm": 5.567542205148069, + "learning_rate": 5.347593582887701e-07, + "logits/chosen": -0.0868334174156189, + "logits/rejected": 0.08281473815441132, + "logps/chosen": -1.3654279708862305, + "logps/rejected": -1.4594337940216064, + "loss": 0.7129, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.3654279708862305, + "rewards/margins": 0.09400572627782822, + "rewards/rejected": -1.4594337940216064, + "sft_loss": 1.4067200422286987, + "step": 300 + }, + { + "epoch": 0.1632379996654959, + "grad_norm": 4.6470009273355535, + "learning_rate": 5.436720142602496e-07, + "logits/chosen": -0.010817406699061394, + "logits/rejected": 0.0604841411113739, + "logps/chosen": -1.4783557653427124, + "logps/rejected": -1.4822551012039185, + "loss": 0.7712, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -1.4783557653427124, + "rewards/margins": 0.0038991898763924837, + "rewards/rejected": -1.4822551012039185, + "sft_loss": 1.456894874572754, + "step": 305 + }, + { + "epoch": 0.16591403244689748, + "grad_norm": 7.661301028049097, + "learning_rate": 5.52584670231729e-07, + "logits/chosen": -0.20085179805755615, + "logits/rejected": -0.10509626567363739, + "logps/chosen": -1.4366939067840576, + "logps/rejected": -1.537825345993042, + "loss": 0.7375, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -1.4366939067840576, + "rewards/margins": 0.10113133490085602, + "rewards/rejected": -1.537825345993042, + "sft_loss": 1.4186075925827026, + "step": 310 + }, + { + "epoch": 0.16859006522829906, + "grad_norm": 8.327325233802375, + "learning_rate": 5.614973262032086e-07, + "logits/chosen": -0.02356710098683834, + "logits/rejected": 0.13607418537139893, + "logps/chosen": -1.429595708847046, + "logps/rejected": -1.6014493703842163, + "loss": 0.7139, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.429595708847046, + "rewards/margins": 0.17185349762439728, + "rewards/rejected": -1.6014493703842163, + "sft_loss": 1.4313127994537354, + "step": 315 + }, + { + "epoch": 0.1712660980097006, + "grad_norm": 4.427112532919185, + "learning_rate": 5.70409982174688e-07, + "logits/chosen": -0.06657937169075012, + "logits/rejected": 0.06765355914831161, + "logps/chosen": -1.3815885782241821, + "logps/rejected": -1.445359230041504, + "loss": 0.7261, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.3815885782241821, + "rewards/margins": 0.06377061456441879, + "rewards/rejected": -1.445359230041504, + "sft_loss": 1.3856720924377441, + "step": 320 + }, + { + "epoch": 0.17394213079110218, + "grad_norm": 7.529891394125554, + "learning_rate": 5.793226381461676e-07, + "logits/chosen": -0.15291689336299896, + "logits/rejected": -0.03675685077905655, + "logps/chosen": -1.3926328420639038, + "logps/rejected": -1.6686160564422607, + "loss": 0.6705, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.3926328420639038, + "rewards/margins": 0.2759833335876465, + "rewards/rejected": -1.6686160564422607, + "sft_loss": 1.453553557395935, + "step": 325 + }, + { + "epoch": 0.17661816357250376, + "grad_norm": 10.632517925413287, + "learning_rate": 5.88235294117647e-07, + "logits/chosen": -0.023518884554505348, + "logits/rejected": 0.12941356003284454, + "logps/chosen": -1.4052571058273315, + "logps/rejected": -1.6226956844329834, + "loss": 0.673, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.4052571058273315, + "rewards/margins": 0.21743862330913544, + "rewards/rejected": -1.6226956844329834, + "sft_loss": 1.4034149646759033, + "step": 330 + }, + { + "epoch": 0.17929419635390534, + "grad_norm": 11.086324499799552, + "learning_rate": 5.971479500891266e-07, + "logits/chosen": 0.021479438990354538, + "logits/rejected": 0.12531518936157227, + "logps/chosen": -1.42387855052948, + "logps/rejected": -1.4794212579727173, + "loss": 0.724, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.42387855052948, + "rewards/margins": 0.05554261803627014, + "rewards/rejected": -1.4794212579727173, + "sft_loss": 1.3950622081756592, + "step": 335 + }, + { + "epoch": 0.18197022913530692, + "grad_norm": 10.388558650210811, + "learning_rate": 6.060606060606061e-07, + "logits/chosen": -0.03921655938029289, + "logits/rejected": 0.11117533594369888, + "logps/chosen": -1.5144026279449463, + "logps/rejected": -1.6030089855194092, + "loss": 0.7484, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.5144026279449463, + "rewards/margins": 0.08860644698143005, + "rewards/rejected": -1.6030089855194092, + "sft_loss": 1.4564971923828125, + "step": 340 + }, + { + "epoch": 0.1846462619167085, + "grad_norm": 10.196942882530658, + "learning_rate": 6.149732620320855e-07, + "logits/chosen": 0.047799251973629, + "logits/rejected": 0.0809812992811203, + "logps/chosen": -1.4201780557632446, + "logps/rejected": -1.5810493230819702, + "loss": 0.6994, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.4201780557632446, + "rewards/margins": 0.16087140142917633, + "rewards/rejected": -1.5810493230819702, + "sft_loss": 1.4226887226104736, + "step": 345 + }, + { + "epoch": 0.18732229469811004, + "grad_norm": 8.817739794382337, + "learning_rate": 6.238859180035651e-07, + "logits/chosen": 0.005632379557937384, + "logits/rejected": 0.10080881416797638, + "logps/chosen": -1.3695688247680664, + "logps/rejected": -1.4985499382019043, + "loss": 0.7175, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.3695688247680664, + "rewards/margins": 0.12898120284080505, + "rewards/rejected": -1.4985499382019043, + "sft_loss": 1.3992822170257568, + "step": 350 + }, + { + "epoch": 0.18999832747951162, + "grad_norm": 6.017069129228117, + "learning_rate": 6.327985739750445e-07, + "logits/chosen": -0.11661320924758911, + "logits/rejected": 0.10363030433654785, + "logps/chosen": -1.4653418064117432, + "logps/rejected": -1.5187338590621948, + "loss": 0.7471, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": -1.4653418064117432, + "rewards/margins": 0.05339198186993599, + "rewards/rejected": -1.5187338590621948, + "sft_loss": 1.460890769958496, + "step": 355 + }, + { + "epoch": 0.1926743602609132, + "grad_norm": 5.990445407054706, + "learning_rate": 6.417112299465241e-07, + "logits/chosen": -0.07277899235486984, + "logits/rejected": 0.009725173935294151, + "logps/chosen": -1.4071974754333496, + "logps/rejected": -1.5529217720031738, + "loss": 0.7105, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.4071974754333496, + "rewards/margins": 0.14572428166866302, + "rewards/rejected": -1.5529217720031738, + "sft_loss": 1.3624567985534668, + "step": 360 + }, + { + "epoch": 0.19535039304231477, + "grad_norm": 10.46336449448534, + "learning_rate": 6.506238859180035e-07, + "logits/chosen": -0.016026372089982033, + "logits/rejected": 0.06777580082416534, + "logps/chosen": -1.3677482604980469, + "logps/rejected": -1.4697232246398926, + "loss": 0.7235, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3677482604980469, + "rewards/margins": 0.10197494179010391, + "rewards/rejected": -1.4697232246398926, + "sft_loss": 1.3233493566513062, + "step": 365 + }, + { + "epoch": 0.19802642582371635, + "grad_norm": 6.70527148965871, + "learning_rate": 6.59536541889483e-07, + "logits/chosen": -0.03155245631933212, + "logits/rejected": 0.06311879307031631, + "logps/chosen": -1.3529279232025146, + "logps/rejected": -1.4192687273025513, + "loss": 0.7291, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.3529279232025146, + "rewards/margins": 0.0663408562541008, + "rewards/rejected": -1.4192687273025513, + "sft_loss": 1.3164355754852295, + "step": 370 + }, + { + "epoch": 0.2007024586051179, + "grad_norm": 6.213682685842821, + "learning_rate": 6.684491978609626e-07, + "logits/chosen": -0.10043720155954361, + "logits/rejected": 0.0535179078578949, + "logps/chosen": -1.332228422164917, + "logps/rejected": -1.5000375509262085, + "loss": 0.6931, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.332228422164917, + "rewards/margins": 0.16780902445316315, + "rewards/rejected": -1.5000375509262085, + "sft_loss": 1.3726236820220947, + "step": 375 + }, + { + "epoch": 0.20337849138651948, + "grad_norm": 5.956018695417195, + "learning_rate": 6.77361853832442e-07, + "logits/chosen": -0.06339693814516068, + "logits/rejected": 0.02334422990679741, + "logps/chosen": -1.3552122116088867, + "logps/rejected": -1.5462400913238525, + "loss": 0.6773, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.3552122116088867, + "rewards/margins": 0.1910279095172882, + "rewards/rejected": -1.5462400913238525, + "sft_loss": 1.3529400825500488, + "step": 380 + }, + { + "epoch": 0.20605452416792105, + "grad_norm": 4.1253425708857, + "learning_rate": 6.862745098039216e-07, + "logits/chosen": -0.02446773275732994, + "logits/rejected": 0.0541352704167366, + "logps/chosen": -1.4543194770812988, + "logps/rejected": -1.466575026512146, + "loss": 0.7721, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.4543194770812988, + "rewards/margins": 0.012255474925041199, + "rewards/rejected": -1.466575026512146, + "sft_loss": 1.4507054090499878, + "step": 385 + }, + { + "epoch": 0.20873055694932263, + "grad_norm": 8.907745404115765, + "learning_rate": 6.95187165775401e-07, + "logits/chosen": 0.06300806254148483, + "logits/rejected": 0.23759731650352478, + "logps/chosen": -1.454111933708191, + "logps/rejected": -1.5439735651016235, + "loss": 0.7385, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.454111933708191, + "rewards/margins": 0.08986148983240128, + "rewards/rejected": -1.5439735651016235, + "sft_loss": 1.4432179927825928, + "step": 390 + }, + { + "epoch": 0.2114065897307242, + "grad_norm": 7.03912409933743, + "learning_rate": 7.040998217468806e-07, + "logits/chosen": -0.07751835882663727, + "logits/rejected": 0.08364540338516235, + "logps/chosen": -1.403145432472229, + "logps/rejected": -1.4548805952072144, + "loss": 0.7233, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.403145432472229, + "rewards/margins": 0.05173531919717789, + "rewards/rejected": -1.4548805952072144, + "sft_loss": 1.3948204517364502, + "step": 395 + }, + { + "epoch": 0.2140826225121258, + "grad_norm": 11.640061370540616, + "learning_rate": 7.1301247771836e-07, + "logits/chosen": 0.048798851668834686, + "logits/rejected": 0.1456696093082428, + "logps/chosen": -1.4081380367279053, + "logps/rejected": -1.5352369546890259, + "loss": 0.7019, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.4081380367279053, + "rewards/margins": 0.12709888815879822, + "rewards/rejected": -1.5352369546890259, + "sft_loss": 1.371368646621704, + "step": 400 + }, + { + "epoch": 0.2140826225121258, + "eval_logits/chosen": 0.21375156939029694, + "eval_logits/rejected": 0.2993115484714508, + "eval_logps/chosen": -1.4375065565109253, + "eval_logps/rejected": -1.6032114028930664, + "eval_loss": 0.6977089643478394, + "eval_rewards/accuracies": 0.5630564093589783, + "eval_rewards/chosen": -1.4375065565109253, + "eval_rewards/margins": 0.16570471227169037, + "eval_rewards/rejected": -1.6032114028930664, + "eval_runtime": 49.8197, + "eval_samples_per_second": 26.997, + "eval_sft_loss": 1.4218635559082031, + "eval_steps_per_second": 6.764, + "step": 400 + }, + { + "epoch": 0.21675865529352734, + "grad_norm": 6.375356479218084, + "learning_rate": 7.219251336898395e-07, + "logits/chosen": -0.04546252638101578, + "logits/rejected": 0.04799600690603256, + "logps/chosen": -1.4380704164505005, + "logps/rejected": -1.5441672801971436, + "loss": 0.723, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.4380704164505005, + "rewards/margins": 0.10609670728445053, + "rewards/rejected": -1.5441672801971436, + "sft_loss": 1.3966643810272217, + "step": 405 + }, + { + "epoch": 0.2194346880749289, + "grad_norm": 9.424186700278675, + "learning_rate": 7.30837789661319e-07, + "logits/chosen": -0.0008656397694721818, + "logits/rejected": 0.13088323175907135, + "logps/chosen": -1.4057133197784424, + "logps/rejected": -1.5324071645736694, + "loss": 0.7085, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -1.4057133197784424, + "rewards/margins": 0.12669387459754944, + "rewards/rejected": -1.5324071645736694, + "sft_loss": 1.4035170078277588, + "step": 410 + }, + { + "epoch": 0.2221107208563305, + "grad_norm": 5.854251356791351, + "learning_rate": 7.397504456327985e-07, + "logits/chosen": -0.028660956770181656, + "logits/rejected": 0.012854715809226036, + "logps/chosen": -1.3972389698028564, + "logps/rejected": -1.5771993398666382, + "loss": 0.6901, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.3972389698028564, + "rewards/margins": 0.17996013164520264, + "rewards/rejected": -1.5771993398666382, + "sft_loss": 1.380652666091919, + "step": 415 + }, + { + "epoch": 0.22478675363773207, + "grad_norm": 5.815774407814431, + "learning_rate": 7.486631016042781e-07, + "logits/chosen": -0.029275968670845032, + "logits/rejected": 0.16776686906814575, + "logps/chosen": -1.361322283744812, + "logps/rejected": -1.4909484386444092, + "loss": 0.7079, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -1.361322283744812, + "rewards/margins": 0.1296263039112091, + "rewards/rejected": -1.4909484386444092, + "sft_loss": 1.3848352432250977, + "step": 420 + }, + { + "epoch": 0.22746278641913364, + "grad_norm": 7.325771410324191, + "learning_rate": 7.575757575757575e-07, + "logits/chosen": -0.07150163501501083, + "logits/rejected": 0.1298082172870636, + "logps/chosen": -1.417018175125122, + "logps/rejected": -1.6245094537734985, + "loss": 0.6712, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.417018175125122, + "rewards/margins": 0.20749130845069885, + "rewards/rejected": -1.6245094537734985, + "sft_loss": 1.457844853401184, + "step": 425 + }, + { + "epoch": 0.2301388192005352, + "grad_norm": 7.679720354675964, + "learning_rate": 7.664884135472371e-07, + "logits/chosen": -0.11018051207065582, + "logits/rejected": 0.08682509511709213, + "logps/chosen": -1.4306937456130981, + "logps/rejected": -1.6434657573699951, + "loss": 0.6759, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.4306937456130981, + "rewards/margins": 0.2127722203731537, + "rewards/rejected": -1.6434657573699951, + "sft_loss": 1.4518373012542725, + "step": 430 + }, + { + "epoch": 0.23281485198193677, + "grad_norm": 8.169121159593331, + "learning_rate": 7.754010695187165e-07, + "logits/chosen": -0.02098211646080017, + "logits/rejected": 0.07202951610088348, + "logps/chosen": -1.31760573387146, + "logps/rejected": -1.4644668102264404, + "loss": 0.6876, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.31760573387146, + "rewards/margins": 0.14686112105846405, + "rewards/rejected": -1.4644668102264404, + "sft_loss": 1.356937289237976, + "step": 435 + }, + { + "epoch": 0.23549088476333835, + "grad_norm": 5.008493811115046, + "learning_rate": 7.84313725490196e-07, + "logits/chosen": -0.009568731300532818, + "logits/rejected": 0.0893973559141159, + "logps/chosen": -1.3814446926116943, + "logps/rejected": -1.5284905433654785, + "loss": 0.6879, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.3814446926116943, + "rewards/margins": 0.14704598486423492, + "rewards/rejected": -1.5284905433654785, + "sft_loss": 1.3872871398925781, + "step": 440 + }, + { + "epoch": 0.23816691754473993, + "grad_norm": 6.714628106326379, + "learning_rate": 7.932263814616755e-07, + "logits/chosen": -0.07186368852853775, + "logits/rejected": 0.04148939996957779, + "logps/chosen": -1.4153398275375366, + "logps/rejected": -1.6106348037719727, + "loss": 0.6975, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.4153398275375366, + "rewards/margins": 0.19529494643211365, + "rewards/rejected": -1.6106348037719727, + "sft_loss": 1.4277794361114502, + "step": 445 + }, + { + "epoch": 0.2408429503261415, + "grad_norm": 9.004914374033351, + "learning_rate": 8.02139037433155e-07, + "logits/chosen": 0.016086876392364502, + "logits/rejected": 0.14900197088718414, + "logps/chosen": -1.4307498931884766, + "logps/rejected": -1.599519968032837, + "loss": 0.6705, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.4307498931884766, + "rewards/margins": 0.1687699854373932, + "rewards/rejected": -1.599519968032837, + "sft_loss": 1.3819632530212402, + "step": 450 + }, + { + "epoch": 0.24351898310754308, + "grad_norm": 6.10990838233586, + "learning_rate": 8.110516934046346e-07, + "logits/chosen": 0.01322667021304369, + "logits/rejected": 0.10899517685174942, + "logps/chosen": -1.3523483276367188, + "logps/rejected": -1.5968722105026245, + "loss": 0.6662, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.3523483276367188, + "rewards/margins": 0.24452391266822815, + "rewards/rejected": -1.5968722105026245, + "sft_loss": 1.3561742305755615, + "step": 455 + }, + { + "epoch": 0.24619501588894463, + "grad_norm": 6.549737853320029, + "learning_rate": 8.19964349376114e-07, + "logits/chosen": -0.12617279589176178, + "logits/rejected": 0.0016090974677354097, + "logps/chosen": -1.479893445968628, + "logps/rejected": -1.5853257179260254, + "loss": 0.7197, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.479893445968628, + "rewards/margins": 0.1054321900010109, + "rewards/rejected": -1.5853257179260254, + "sft_loss": 1.4918811321258545, + "step": 460 + }, + { + "epoch": 0.2488710486703462, + "grad_norm": 6.783572560298443, + "learning_rate": 8.288770053475936e-07, + "logits/chosen": 0.12295699119567871, + "logits/rejected": 0.14290973544120789, + "logps/chosen": -1.4391229152679443, + "logps/rejected": -1.633167028427124, + "loss": 0.7003, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.4391229152679443, + "rewards/margins": 0.1940440833568573, + "rewards/rejected": -1.633167028427124, + "sft_loss": 1.4124929904937744, + "step": 465 + }, + { + "epoch": 0.2515470814517478, + "grad_norm": 6.340493689806178, + "learning_rate": 8.37789661319073e-07, + "logits/chosen": 0.15927724540233612, + "logits/rejected": 0.10868068784475327, + "logps/chosen": -1.3759548664093018, + "logps/rejected": -1.5975778102874756, + "loss": 0.6741, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.3759548664093018, + "rewards/margins": 0.22162313759326935, + "rewards/rejected": -1.5975778102874756, + "sft_loss": 1.3803473711013794, + "step": 470 + }, + { + "epoch": 0.25422311423314936, + "grad_norm": 5.897942882416216, + "learning_rate": 8.467023172905525e-07, + "logits/chosen": -0.06146723031997681, + "logits/rejected": 0.08610192686319351, + "logps/chosen": -1.4093339443206787, + "logps/rejected": -1.7378123998641968, + "loss": 0.6464, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.4093339443206787, + "rewards/margins": 0.3284783959388733, + "rewards/rejected": -1.7378123998641968, + "sft_loss": 1.4273899793624878, + "step": 475 + }, + { + "epoch": 0.2568991470145509, + "grad_norm": 6.98144339229632, + "learning_rate": 8.55614973262032e-07, + "logits/chosen": -0.059163518249988556, + "logits/rejected": 0.14092698693275452, + "logps/chosen": -1.3883863687515259, + "logps/rejected": -1.507129430770874, + "loss": 0.7007, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.3883863687515259, + "rewards/margins": 0.11874298751354218, + "rewards/rejected": -1.507129430770874, + "sft_loss": 1.3975694179534912, + "step": 480 + }, + { + "epoch": 0.2595751797959525, + "grad_norm": 7.168833653699756, + "learning_rate": 8.645276292335115e-07, + "logits/chosen": -0.0012844301527366042, + "logits/rejected": 0.04530390724539757, + "logps/chosen": -1.5146420001983643, + "logps/rejected": -1.6267915964126587, + "loss": 0.7237, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -1.5146420001983643, + "rewards/margins": 0.11214945465326309, + "rewards/rejected": -1.6267915964126587, + "sft_loss": 1.478551983833313, + "step": 485 + }, + { + "epoch": 0.26225121257735406, + "grad_norm": 5.927038210209542, + "learning_rate": 8.734402852049911e-07, + "logits/chosen": 0.0074288249015808105, + "logits/rejected": 0.07527925074100494, + "logps/chosen": -1.4502588510513306, + "logps/rejected": -1.5668810606002808, + "loss": 0.7244, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.4502588510513306, + "rewards/margins": 0.116622194647789, + "rewards/rejected": -1.5668810606002808, + "sft_loss": 1.4134190082550049, + "step": 490 + }, + { + "epoch": 0.26492724535875567, + "grad_norm": 7.74594919759278, + "learning_rate": 8.823529411764705e-07, + "logits/chosen": -0.05722605064511299, + "logits/rejected": -0.031624868512153625, + "logps/chosen": -1.4506053924560547, + "logps/rejected": -1.5846562385559082, + "loss": 0.7083, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.4506053924560547, + "rewards/margins": 0.13405072689056396, + "rewards/rejected": -1.5846562385559082, + "sft_loss": 1.4903643131256104, + "step": 495 + }, + { + "epoch": 0.2676032781401572, + "grad_norm": 5.9093862472476335, + "learning_rate": 8.912655971479501e-07, + "logits/chosen": -0.05097437649965286, + "logits/rejected": 0.05235857516527176, + "logps/chosen": -1.3838527202606201, + "logps/rejected": -1.5711188316345215, + "loss": 0.6943, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -1.3838527202606201, + "rewards/margins": 0.1872660517692566, + "rewards/rejected": -1.5711188316345215, + "sft_loss": 1.3825076818466187, + "step": 500 + }, + { + "epoch": 0.27027931092155877, + "grad_norm": 9.934857874379231, + "learning_rate": 9.001782531194295e-07, + "logits/chosen": -0.09654500335454941, + "logits/rejected": 0.04827988147735596, + "logps/chosen": -1.4918394088745117, + "logps/rejected": -1.5619932413101196, + "loss": 0.7319, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.4918394088745117, + "rewards/margins": 0.07015396654605865, + "rewards/rejected": -1.5619932413101196, + "sft_loss": 1.4723923206329346, + "step": 505 + }, + { + "epoch": 0.2729553437029604, + "grad_norm": 6.643593763195416, + "learning_rate": 9.09090909090909e-07, + "logits/chosen": 0.07212212681770325, + "logits/rejected": 0.1325608640909195, + "logps/chosen": -1.4403663873672485, + "logps/rejected": -1.6584949493408203, + "loss": 0.6729, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.4403663873672485, + "rewards/margins": 0.2181284874677658, + "rewards/rejected": -1.6584949493408203, + "sft_loss": 1.3849339485168457, + "step": 510 + }, + { + "epoch": 0.2756313764843619, + "grad_norm": 5.457019277334743, + "learning_rate": 9.180035650623885e-07, + "logits/chosen": 0.0359685979783535, + "logits/rejected": 0.13570207357406616, + "logps/chosen": -1.3674839735031128, + "logps/rejected": -1.5590038299560547, + "loss": 0.6877, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.3674839735031128, + "rewards/margins": 0.19151967763900757, + "rewards/rejected": -1.5590038299560547, + "sft_loss": 1.3747385740280151, + "step": 515 + }, + { + "epoch": 0.27830740926576353, + "grad_norm": 5.525007303010948, + "learning_rate": 9.26916221033868e-07, + "logits/chosen": -0.08311732113361359, + "logits/rejected": 0.056603431701660156, + "logps/chosen": -1.423223853111267, + "logps/rejected": -1.5819368362426758, + "loss": 0.702, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -1.423223853111267, + "rewards/margins": 0.1587129533290863, + "rewards/rejected": -1.5819368362426758, + "sft_loss": 1.498355507850647, + "step": 520 + }, + { + "epoch": 0.2809834420471651, + "grad_norm": 11.421683960091872, + "learning_rate": 9.358288770053476e-07, + "logits/chosen": 0.0943758636713028, + "logits/rejected": 0.17001059651374817, + "logps/chosen": -1.4106147289276123, + "logps/rejected": -1.6605732440948486, + "loss": 0.6754, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.4106147289276123, + "rewards/margins": 0.24995842576026917, + "rewards/rejected": -1.6605732440948486, + "sft_loss": 1.474737524986267, + "step": 525 + }, + { + "epoch": 0.2836594748285666, + "grad_norm": 5.183308574444876, + "learning_rate": 9.44741532976827e-07, + "logits/chosen": 0.06468604505062103, + "logits/rejected": 0.15001121163368225, + "logps/chosen": -1.3696916103363037, + "logps/rejected": -1.5476267337799072, + "loss": 0.6941, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.3696916103363037, + "rewards/margins": 0.1779351830482483, + "rewards/rejected": -1.5476267337799072, + "sft_loss": 1.3246756792068481, + "step": 530 + }, + { + "epoch": 0.28633550760996823, + "grad_norm": 5.118190343811381, + "learning_rate": 9.536541889483066e-07, + "logits/chosen": -0.10416406393051147, + "logits/rejected": 0.16024169325828552, + "logps/chosen": -1.3779942989349365, + "logps/rejected": -1.509367823600769, + "loss": 0.6947, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.3779942989349365, + "rewards/margins": 0.13137365877628326, + "rewards/rejected": -1.509367823600769, + "sft_loss": 1.3444318771362305, + "step": 535 + }, + { + "epoch": 0.2890115403913698, + "grad_norm": 4.741957444228488, + "learning_rate": 9.62566844919786e-07, + "logits/chosen": 0.0058511835522949696, + "logits/rejected": 0.08141259849071503, + "logps/chosen": -1.5181024074554443, + "logps/rejected": -1.6562395095825195, + "loss": 0.7145, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.5181024074554443, + "rewards/margins": 0.13813704252243042, + "rewards/rejected": -1.6562395095825195, + "sft_loss": 1.5423694849014282, + "step": 540 + }, + { + "epoch": 0.2916875731727714, + "grad_norm": 6.190035659114328, + "learning_rate": 9.714795008912655e-07, + "logits/chosen": -0.09001535922288895, + "logits/rejected": 0.1136549562215805, + "logps/chosen": -1.4268022775650024, + "logps/rejected": -1.618949294090271, + "loss": 0.6708, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.4268022775650024, + "rewards/margins": 0.1921471804380417, + "rewards/rejected": -1.618949294090271, + "sft_loss": 1.430987000465393, + "step": 545 + }, + { + "epoch": 0.29436360595417294, + "grad_norm": 6.200801616546889, + "learning_rate": 9.80392156862745e-07, + "logits/chosen": 0.05723940208554268, + "logits/rejected": 0.1289665400981903, + "logps/chosen": -1.4569398164749146, + "logps/rejected": -1.6450210809707642, + "loss": 0.6722, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.4569398164749146, + "rewards/margins": 0.18808124959468842, + "rewards/rejected": -1.6450210809707642, + "sft_loss": 1.4220554828643799, + "step": 550 + }, + { + "epoch": 0.2970396387355745, + "grad_norm": 11.232439041213702, + "learning_rate": 9.893048128342244e-07, + "logits/chosen": -0.03822972625494003, + "logits/rejected": 0.09498357772827148, + "logps/chosen": -1.550957441329956, + "logps/rejected": -1.6663873195648193, + "loss": 0.7298, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.550957441329956, + "rewards/margins": 0.11543013900518417, + "rewards/rejected": -1.6663873195648193, + "sft_loss": 1.5272772312164307, + "step": 555 + }, + { + "epoch": 0.2997156715169761, + "grad_norm": 7.814214005511332, + "learning_rate": 9.98217468805704e-07, + "logits/chosen": 0.07431139051914215, + "logits/rejected": 0.09394214302301407, + "logps/chosen": -1.399718165397644, + "logps/rejected": -1.6008237600326538, + "loss": 0.6723, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.399718165397644, + "rewards/margins": 0.20110544562339783, + "rewards/rejected": -1.6008237600326538, + "sft_loss": 1.5155736207962036, + "step": 560 + }, + { + "epoch": 0.30239170429837764, + "grad_norm": 5.516404765486951, + "learning_rate": 9.999984476788462e-07, + "logits/chosen": 0.03611458092927933, + "logits/rejected": 0.0971362441778183, + "logps/chosen": -1.5159542560577393, + "logps/rejected": -1.7062089443206787, + "loss": 0.6873, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.5159542560577393, + "rewards/margins": 0.1902548372745514, + "rewards/rejected": -1.7062089443206787, + "sft_loss": 1.521512746810913, + "step": 565 + }, + { + "epoch": 0.30506773707977924, + "grad_norm": 10.217360300757083, + "learning_rate": 9.999921413906797e-07, + "logits/chosen": -0.02934812568128109, + "logits/rejected": 0.19856297969818115, + "logps/chosen": -1.4896576404571533, + "logps/rejected": -1.6452884674072266, + "loss": 0.7056, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.4896576404571533, + "rewards/margins": 0.15563085675239563, + "rewards/rejected": -1.6452884674072266, + "sft_loss": 1.5293747186660767, + "step": 570 + }, + { + "epoch": 0.3077437698611808, + "grad_norm": 6.126063460820979, + "learning_rate": 9.999809841765644e-07, + "logits/chosen": -0.015764957293868065, + "logits/rejected": 0.047070231288671494, + "logps/chosen": -1.4180388450622559, + "logps/rejected": -1.6173263788223267, + "loss": 0.6796, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.4180388450622559, + "rewards/margins": 0.1992875337600708, + "rewards/rejected": -1.6173263788223267, + "sft_loss": 1.4322500228881836, + "step": 575 + }, + { + "epoch": 0.3104198026425824, + "grad_norm": 5.442692462822735, + "learning_rate": 9.999649761447477e-07, + "logits/chosen": -0.022562870755791664, + "logits/rejected": 0.13785383105278015, + "logps/chosen": -1.4369986057281494, + "logps/rejected": -1.7041908502578735, + "loss": 0.6478, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.4369986057281494, + "rewards/margins": 0.26719212532043457, + "rewards/rejected": -1.7041908502578735, + "sft_loss": 1.439800500869751, + "step": 580 + }, + { + "epoch": 0.31309583542398395, + "grad_norm": 7.033602039336304, + "learning_rate": 9.999441174505398e-07, + "logits/chosen": -0.05476289987564087, + "logits/rejected": 0.055011261254549026, + "logps/chosen": -1.6148452758789062, + "logps/rejected": -1.7512447834014893, + "loss": 0.7277, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.6148452758789062, + "rewards/margins": 0.13639959692955017, + "rewards/rejected": -1.7512447834014893, + "sft_loss": 1.599481225013733, + "step": 585 + }, + { + "epoch": 0.3157718682053855, + "grad_norm": 10.075903252412404, + "learning_rate": 9.999184082963116e-07, + "logits/chosen": -0.011929292231798172, + "logits/rejected": 0.11828957498073578, + "logps/chosen": -1.5608313083648682, + "logps/rejected": -1.6579519510269165, + "loss": 0.721, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.5608313083648682, + "rewards/margins": 0.09712080657482147, + "rewards/rejected": -1.6579519510269165, + "sft_loss": 1.5709166526794434, + "step": 590 + }, + { + "epoch": 0.3184479009867871, + "grad_norm": 7.47194767395495, + "learning_rate": 9.998878489314937e-07, + "logits/chosen": 0.03971802070736885, + "logits/rejected": 0.17081709206104279, + "logps/chosen": -1.4810861349105835, + "logps/rejected": -1.6688859462738037, + "loss": 0.6828, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -1.4810861349105835, + "rewards/margins": 0.18779988586902618, + "rewards/rejected": -1.6688859462738037, + "sft_loss": 1.4875504970550537, + "step": 595 + }, + { + "epoch": 0.32112393376818865, + "grad_norm": 5.715123632371702, + "learning_rate": 9.99852439652573e-07, + "logits/chosen": -0.008820680901408195, + "logits/rejected": 0.14663805067539215, + "logps/chosen": -1.513209581375122, + "logps/rejected": -1.6487150192260742, + "loss": 0.6979, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.513209581375122, + "rewards/margins": 0.13550536334514618, + "rewards/rejected": -1.6487150192260742, + "sft_loss": 1.5156606435775757, + "step": 600 + }, + { + "epoch": 0.32379996654959026, + "grad_norm": 7.759220702730302, + "learning_rate": 9.998121808030904e-07, + "logits/chosen": -0.06416098773479462, + "logits/rejected": 0.021558623760938644, + "logps/chosen": -1.6154426336288452, + "logps/rejected": -1.846312165260315, + "loss": 0.6851, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.6154426336288452, + "rewards/margins": 0.23086953163146973, + "rewards/rejected": -1.846312165260315, + "sft_loss": 1.6153326034545898, + "step": 605 + }, + { + "epoch": 0.3264759993309918, + "grad_norm": 15.63102859711399, + "learning_rate": 9.997670727736379e-07, + "logits/chosen": 0.06927318871021271, + "logits/rejected": 0.2384120672941208, + "logps/chosen": -1.572723388671875, + "logps/rejected": -1.791204810142517, + "loss": 0.6892, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.572723388671875, + "rewards/margins": 0.21848134696483612, + "rewards/rejected": -1.791204810142517, + "sft_loss": 1.5574983358383179, + "step": 610 + }, + { + "epoch": 0.32915203211239336, + "grad_norm": 5.568616824813846, + "learning_rate": 9.99717116001853e-07, + "logits/chosen": -0.04871724173426628, + "logits/rejected": 0.055701516568660736, + "logps/chosen": -1.562664270401001, + "logps/rejected": -1.8564201593399048, + "loss": 0.66, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.562664270401001, + "rewards/margins": 0.29375597834587097, + "rewards/rejected": -1.8564201593399048, + "sft_loss": 1.5629616975784302, + "step": 615 + }, + { + "epoch": 0.33182806489379496, + "grad_norm": 6.02384270726035, + "learning_rate": 9.996623109724173e-07, + "logits/chosen": 0.04730135574936867, + "logits/rejected": 0.11533886194229126, + "logps/chosen": -1.6759907007217407, + "logps/rejected": -1.8699016571044922, + "loss": 0.6981, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.6759907007217407, + "rewards/margins": 0.19391095638275146, + "rewards/rejected": -1.8699016571044922, + "sft_loss": 1.651746392250061, + "step": 620 + }, + { + "epoch": 0.3345040976751965, + "grad_norm": 8.76762564401349, + "learning_rate": 9.996026582170488e-07, + "logits/chosen": 0.08321405947208405, + "logits/rejected": 0.20709185302257538, + "logps/chosen": -1.5465686321258545, + "logps/rejected": -1.8440536260604858, + "loss": 0.6417, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.5465686321258545, + "rewards/margins": 0.297484815120697, + "rewards/rejected": -1.8440536260604858, + "sft_loss": 1.563207745552063, + "step": 625 + }, + { + "epoch": 0.3371801304565981, + "grad_norm": 9.201337338268745, + "learning_rate": 9.995381583144996e-07, + "logits/chosen": 0.020206613466143608, + "logits/rejected": 0.13258489966392517, + "logps/chosen": -1.6068108081817627, + "logps/rejected": -1.890876054763794, + "loss": 0.6448, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.6068108081817627, + "rewards/margins": 0.28406545519828796, + "rewards/rejected": -1.890876054763794, + "sft_loss": 1.561948537826538, + "step": 630 + }, + { + "epoch": 0.33985616323799966, + "grad_norm": 5.093884861429177, + "learning_rate": 9.994688118905471e-07, + "logits/chosen": 0.011320598423480988, + "logits/rejected": 0.2581380009651184, + "logps/chosen": -1.7048816680908203, + "logps/rejected": -1.9319698810577393, + "loss": 0.6912, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.7048816680908203, + "rewards/margins": 0.22708837687969208, + "rewards/rejected": -1.9319698810577393, + "sft_loss": 1.6976354122161865, + "step": 635 + }, + { + "epoch": 0.3425321960194012, + "grad_norm": 17.736312901847896, + "learning_rate": 9.993946196179912e-07, + "logits/chosen": -0.06087531894445419, + "logits/rejected": 0.1462034285068512, + "logps/chosen": -1.681366205215454, + "logps/rejected": -1.9161033630371094, + "loss": 0.6828, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.681366205215454, + "rewards/margins": 0.23473712801933289, + "rewards/rejected": -1.9161033630371094, + "sft_loss": 1.709673523902893, + "step": 640 + }, + { + "epoch": 0.3452082288008028, + "grad_norm": 8.314358446774131, + "learning_rate": 9.993155822166455e-07, + "logits/chosen": -0.03272467106580734, + "logits/rejected": 0.05644283443689346, + "logps/chosen": -1.6156227588653564, + "logps/rejected": -1.9258009195327759, + "loss": 0.6386, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.6156227588653564, + "rewards/margins": 0.3101782202720642, + "rewards/rejected": -1.9258009195327759, + "sft_loss": 1.5771795511245728, + "step": 645 + }, + { + "epoch": 0.34788426158220437, + "grad_norm": 9.800112089550767, + "learning_rate": 9.992317004533313e-07, + "logits/chosen": 0.0005621820455417037, + "logits/rejected": 0.1446131467819214, + "logps/chosen": -1.7680717706680298, + "logps/rejected": -2.068042278289795, + "loss": 0.6571, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.7680717706680298, + "rewards/margins": 0.2999705374240875, + "rewards/rejected": -2.068042278289795, + "sft_loss": 1.7745215892791748, + "step": 650 + }, + { + "epoch": 0.350560294363606, + "grad_norm": 8.891072766513426, + "learning_rate": 9.991429751418696e-07, + "logits/chosen": 0.06538809090852737, + "logits/rejected": 0.08319473266601562, + "logps/chosen": -1.7189629077911377, + "logps/rejected": -2.026162624359131, + "loss": 0.6864, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.7189629077911377, + "rewards/margins": 0.3071998655796051, + "rewards/rejected": -2.026162624359131, + "sft_loss": 1.7231667041778564, + "step": 655 + }, + { + "epoch": 0.3532363271450075, + "grad_norm": 8.35891612764567, + "learning_rate": 9.99049407143074e-07, + "logits/chosen": 0.046678341925144196, + "logits/rejected": 0.1763012558221817, + "logps/chosen": -1.7010895013809204, + "logps/rejected": -1.878273367881775, + "loss": 0.705, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -1.7010895013809204, + "rewards/margins": 0.17718379199504852, + "rewards/rejected": -1.878273367881775, + "sft_loss": 1.7096478939056396, + "step": 660 + }, + { + "epoch": 0.35591235992640907, + "grad_norm": 5.872723655675756, + "learning_rate": 9.989509973647416e-07, + "logits/chosen": 0.02392008528113365, + "logits/rejected": 0.16449224948883057, + "logps/chosen": -1.6240675449371338, + "logps/rejected": -1.9025075435638428, + "loss": 0.6608, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.6240675449371338, + "rewards/margins": 0.27844005823135376, + "rewards/rejected": -1.9025075435638428, + "sft_loss": 1.6689106225967407, + "step": 665 + }, + { + "epoch": 0.3585883927078107, + "grad_norm": 7.4402063008554835, + "learning_rate": 9.988477467616445e-07, + "logits/chosen": -0.0179903507232666, + "logits/rejected": 0.18532223999500275, + "logps/chosen": -1.6726337671279907, + "logps/rejected": -1.9018869400024414, + "loss": 0.6558, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -1.6726337671279907, + "rewards/margins": 0.22925344109535217, + "rewards/rejected": -1.9018869400024414, + "sft_loss": 1.7706172466278076, + "step": 670 + }, + { + "epoch": 0.3612644254892122, + "grad_norm": 9.22882511476252, + "learning_rate": 9.987396563355205e-07, + "logits/chosen": -0.024912597611546516, + "logits/rejected": 0.04954840987920761, + "logps/chosen": -1.6799808740615845, + "logps/rejected": -2.0296683311462402, + "loss": 0.637, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.6799808740615845, + "rewards/margins": 0.3496876657009125, + "rewards/rejected": -2.0296683311462402, + "sft_loss": 1.7392793893814087, + "step": 675 + }, + { + "epoch": 0.36394045827061383, + "grad_norm": 7.178215796653519, + "learning_rate": 9.986267271350631e-07, + "logits/chosen": 0.08153931051492691, + "logits/rejected": 0.24789385497570038, + "logps/chosen": -1.7651420831680298, + "logps/rejected": -2.004348039627075, + "loss": 0.7226, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.7651420831680298, + "rewards/margins": 0.2392059862613678, + "rewards/rejected": -2.004348039627075, + "sft_loss": 1.7212005853652954, + "step": 680 + }, + { + "epoch": 0.3666164910520154, + "grad_norm": 17.26491478757877, + "learning_rate": 9.985089602559123e-07, + "logits/chosen": 0.037060752511024475, + "logits/rejected": 0.1910381019115448, + "logps/chosen": -1.7412440776824951, + "logps/rejected": -2.0073153972625732, + "loss": 0.6763, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.7412440776824951, + "rewards/margins": 0.2660714387893677, + "rewards/rejected": -2.0073153972625732, + "sft_loss": 1.7309767007827759, + "step": 685 + }, + { + "epoch": 0.369292523833417, + "grad_norm": 9.390663949447314, + "learning_rate": 9.983863568406428e-07, + "logits/chosen": 0.07388173043727875, + "logits/rejected": 0.11109878122806549, + "logps/chosen": -1.765343427658081, + "logps/rejected": -2.0304367542266846, + "loss": 0.6819, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.765343427658081, + "rewards/margins": 0.26509329676628113, + "rewards/rejected": -2.0304367542266846, + "sft_loss": 1.8135614395141602, + "step": 690 + }, + { + "epoch": 0.37196855661481854, + "grad_norm": 6.290396110648321, + "learning_rate": 9.982589180787532e-07, + "logits/chosen": 0.03160684183239937, + "logits/rejected": 0.11958432197570801, + "logps/chosen": -1.6528549194335938, + "logps/rejected": -1.9509315490722656, + "loss": 0.6564, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.6528549194335938, + "rewards/margins": 0.29807668924331665, + "rewards/rejected": -1.9509315490722656, + "sft_loss": 1.7110259532928467, + "step": 695 + }, + { + "epoch": 0.3746445893962201, + "grad_norm": 10.086599389475353, + "learning_rate": 9.981266452066553e-07, + "logits/chosen": -0.09080805629491806, + "logits/rejected": 0.04179905354976654, + "logps/chosen": -1.8715393543243408, + "logps/rejected": -2.0836453437805176, + "loss": 0.6784, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.8715393543243408, + "rewards/margins": 0.21210607886314392, + "rewards/rejected": -2.0836453437805176, + "sft_loss": 1.8276889324188232, + "step": 700 + }, + { + "epoch": 0.3773206221776217, + "grad_norm": 9.64164987311192, + "learning_rate": 9.979895395076608e-07, + "logits/chosen": -0.04067766293883324, + "logits/rejected": 0.12535516917705536, + "logps/chosen": -1.8169269561767578, + "logps/rejected": -2.19791841506958, + "loss": 0.6354, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.8169269561767578, + "rewards/margins": 0.3809918165206909, + "rewards/rejected": -2.19791841506958, + "sft_loss": 1.835120439529419, + "step": 705 + }, + { + "epoch": 0.37999665495902324, + "grad_norm": 9.28735932443622, + "learning_rate": 9.9784760231197e-07, + "logits/chosen": 0.05085369944572449, + "logits/rejected": 0.1413969099521637, + "logps/chosen": -1.8504364490509033, + "logps/rejected": -2.155524492263794, + "loss": 0.6419, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.8504364490509033, + "rewards/margins": 0.3050883710384369, + "rewards/rejected": -2.155524492263794, + "sft_loss": 1.8197228908538818, + "step": 710 + }, + { + "epoch": 0.38267268774042484, + "grad_norm": 12.115110830896828, + "learning_rate": 9.97700834996658e-07, + "logits/chosen": 0.011143917217850685, + "logits/rejected": 0.17604656517505646, + "logps/chosen": -1.9947057962417603, + "logps/rejected": -2.267084836959839, + "loss": 0.6649, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.9947057962417603, + "rewards/margins": 0.27237898111343384, + "rewards/rejected": -2.267084836959839, + "sft_loss": 1.9120969772338867, + "step": 715 + }, + { + "epoch": 0.3853487205218264, + "grad_norm": 11.624644878116946, + "learning_rate": 9.97549238985662e-07, + "logits/chosen": 0.08611107617616653, + "logits/rejected": 0.27575188875198364, + "logps/chosen": -2.0711281299591064, + "logps/rejected": -2.4074230194091797, + "loss": 0.6771, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -2.0711281299591064, + "rewards/margins": 0.33629506826400757, + "rewards/rejected": -2.4074230194091797, + "sft_loss": 2.0715701580047607, + "step": 720 + }, + { + "epoch": 0.38802475330322794, + "grad_norm": 9.108287386730169, + "learning_rate": 9.973928157497674e-07, + "logits/chosen": -0.04209893196821213, + "logits/rejected": 0.10033257305622101, + "logps/chosen": -1.863908052444458, + "logps/rejected": -2.286168336868286, + "loss": 0.6031, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.863908052444458, + "rewards/margins": 0.422260582447052, + "rewards/rejected": -2.286168336868286, + "sft_loss": 1.9227043390274048, + "step": 725 + }, + { + "epoch": 0.39070078608462955, + "grad_norm": 14.030613682567305, + "learning_rate": 9.972315668065927e-07, + "logits/chosen": -0.06971832364797592, + "logits/rejected": 0.08316639065742493, + "logps/chosen": -2.050490617752075, + "logps/rejected": -2.3599963188171387, + "loss": 0.6707, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -2.050490617752075, + "rewards/margins": 0.3095058500766754, + "rewards/rejected": -2.3599963188171387, + "sft_loss": 2.0619983673095703, + "step": 730 + }, + { + "epoch": 0.3933768188660311, + "grad_norm": 7.885707770222676, + "learning_rate": 9.97065493720576e-07, + "logits/chosen": -0.050217293202877045, + "logits/rejected": 0.04937911778688431, + "logps/chosen": -2.032588005065918, + "logps/rejected": -2.2886781692504883, + "loss": 0.6714, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -2.032588005065918, + "rewards/margins": 0.2560901343822479, + "rewards/rejected": -2.2886781692504883, + "sft_loss": 2.058027505874634, + "step": 735 + }, + { + "epoch": 0.3960528516474327, + "grad_norm": 15.556154977262148, + "learning_rate": 9.968945981029594e-07, + "logits/chosen": -0.03946347162127495, + "logits/rejected": 0.12837369740009308, + "logps/chosen": -2.0982556343078613, + "logps/rejected": -2.4328927993774414, + "loss": 0.655, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -2.0982556343078613, + "rewards/margins": 0.3346374034881592, + "rewards/rejected": -2.4328927993774414, + "sft_loss": 2.1010241508483887, + "step": 740 + }, + { + "epoch": 0.39872888442883425, + "grad_norm": 6.841990333720714, + "learning_rate": 9.967188816117726e-07, + "logits/chosen": 0.06810925900936127, + "logits/rejected": 0.14391477406024933, + "logps/chosen": -2.0970871448516846, + "logps/rejected": -2.4689412117004395, + "loss": 0.6865, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -2.0970871448516846, + "rewards/margins": 0.37185385823249817, + "rewards/rejected": -2.4689412117004395, + "sft_loss": 2.0899500846862793, + "step": 745 + }, + { + "epoch": 0.4014049172102358, + "grad_norm": 9.963614775990674, + "learning_rate": 9.965383459518179e-07, + "logits/chosen": -0.005502223968505859, + "logits/rejected": 0.15343348681926727, + "logps/chosen": -2.0381226539611816, + "logps/rejected": -2.447211742401123, + "loss": 0.632, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.0381226539611816, + "rewards/margins": 0.40908899903297424, + "rewards/rejected": -2.447211742401123, + "sft_loss": 2.032761812210083, + "step": 750 + }, + { + "epoch": 0.4040809499916374, + "grad_norm": 9.315325168804089, + "learning_rate": 9.963529928746533e-07, + "logits/chosen": 0.04696832224726677, + "logits/rejected": 0.17964449524879456, + "logps/chosen": -2.0477912425994873, + "logps/rejected": -2.3599047660827637, + "loss": 0.6803, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -2.0477912425994873, + "rewards/margins": 0.31211379170417786, + "rewards/rejected": -2.3599047660827637, + "sft_loss": 2.053529977798462, + "step": 755 + }, + { + "epoch": 0.40675698277303896, + "grad_norm": 6.797423613480653, + "learning_rate": 9.961628241785746e-07, + "logits/chosen": -0.04901718348264694, + "logits/rejected": 0.028940856456756592, + "logps/chosen": -2.0668463706970215, + "logps/rejected": -2.386622905731201, + "loss": 0.6586, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.0668463706970215, + "rewards/margins": 0.3197762370109558, + "rewards/rejected": -2.386622905731201, + "sft_loss": 2.104837656021118, + "step": 760 + }, + { + "epoch": 0.40943301555444056, + "grad_norm": 9.903022378126753, + "learning_rate": 9.959678417085998e-07, + "logits/chosen": 0.015145739540457726, + "logits/rejected": 0.11417007446289062, + "logps/chosen": -1.9795106649398804, + "logps/rejected": -2.254296064376831, + "loss": 0.6665, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.9795106649398804, + "rewards/margins": 0.2747856080532074, + "rewards/rejected": -2.254296064376831, + "sft_loss": 1.9884569644927979, + "step": 765 + }, + { + "epoch": 0.4121090483358421, + "grad_norm": 12.022757569057635, + "learning_rate": 9.957680473564493e-07, + "logits/chosen": 0.09810831397771835, + "logits/rejected": 0.22772392630577087, + "logps/chosen": -1.9476169347763062, + "logps/rejected": -2.4173974990844727, + "loss": 0.6122, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.9476169347763062, + "rewards/margins": 0.4697801470756531, + "rewards/rejected": -2.4173974990844727, + "sft_loss": 1.9548561573028564, + "step": 770 + }, + { + "epoch": 0.41478508111724366, + "grad_norm": 6.422764925697639, + "learning_rate": 9.95563443060529e-07, + "logits/chosen": -0.1036531925201416, + "logits/rejected": 0.06811682879924774, + "logps/chosen": -2.0023977756500244, + "logps/rejected": -2.3354830741882324, + "loss": 0.6719, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -2.0023977756500244, + "rewards/margins": 0.3330851197242737, + "rewards/rejected": -2.3354830741882324, + "sft_loss": 1.9613491296768188, + "step": 775 + }, + { + "epoch": 0.41746111389864526, + "grad_norm": 9.881302782545758, + "learning_rate": 9.95354030805911e-07, + "logits/chosen": -0.15749771893024445, + "logits/rejected": -0.008543589152395725, + "logps/chosen": -2.031306028366089, + "logps/rejected": -2.3502228260040283, + "loss": 0.6487, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -2.031306028366089, + "rewards/margins": 0.318916380405426, + "rewards/rejected": -2.3502228260040283, + "sft_loss": 2.0914266109466553, + "step": 780 + }, + { + "epoch": 0.4201371466800468, + "grad_norm": 8.31230234799272, + "learning_rate": 9.951398126243133e-07, + "logits/chosen": -0.02615680918097496, + "logits/rejected": 0.09981267154216766, + "logps/chosen": -1.9661918878555298, + "logps/rejected": -2.4096293449401855, + "loss": 0.6199, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.9661918878555298, + "rewards/margins": 0.44343748688697815, + "rewards/rejected": -2.4096293449401855, + "sft_loss": 1.9984050989151, + "step": 785 + }, + { + "epoch": 0.4228131794614484, + "grad_norm": 11.315300797578953, + "learning_rate": 9.94920790594082e-07, + "logits/chosen": -0.060059063136577606, + "logits/rejected": 0.06812725216150284, + "logps/chosen": -1.9654719829559326, + "logps/rejected": -2.4030096530914307, + "loss": 0.6103, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.9654719829559326, + "rewards/margins": 0.4375377595424652, + "rewards/rejected": -2.4030096530914307, + "sft_loss": 1.9656639099121094, + "step": 790 + }, + { + "epoch": 0.42548921224284997, + "grad_norm": 22.755632032076623, + "learning_rate": 9.946969668401696e-07, + "logits/chosen": -0.07668205350637436, + "logits/rejected": 0.11146184056997299, + "logps/chosen": -2.0439791679382324, + "logps/rejected": -2.5747227668762207, + "loss": 0.632, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.0439791679382324, + "rewards/margins": 0.5307438969612122, + "rewards/rejected": -2.5747227668762207, + "sft_loss": 2.075232744216919, + "step": 795 + }, + { + "epoch": 0.4281652450242516, + "grad_norm": 8.03414191880417, + "learning_rate": 9.944683435341155e-07, + "logits/chosen": -0.04020323604345322, + "logits/rejected": 0.03881923481822014, + "logps/chosen": -2.0494797229766846, + "logps/rejected": -2.466977596282959, + "loss": 0.6225, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.0494797229766846, + "rewards/margins": 0.41749781370162964, + "rewards/rejected": -2.466977596282959, + "sft_loss": 2.019458055496216, + "step": 800 + }, + { + "epoch": 0.4281652450242516, + "eval_logits/chosen": 0.256985604763031, + "eval_logits/rejected": 0.34294527769088745, + "eval_logps/chosen": -2.0770034790039062, + "eval_logps/rejected": -2.5396463871002197, + "eval_loss": 0.6192032694816589, + "eval_rewards/accuracies": 0.6669139266014099, + "eval_rewards/chosen": -2.0770034790039062, + "eval_rewards/margins": 0.46264272928237915, + "eval_rewards/rejected": -2.5396463871002197, + "eval_runtime": 51.5427, + "eval_samples_per_second": 26.095, + "eval_sft_loss": 2.057264804840088, + "eval_steps_per_second": 6.538, + "step": 800 + }, + { + "epoch": 0.4308412778056531, + "grad_norm": 11.213423217245992, + "learning_rate": 9.942349228940236e-07, + "logits/chosen": -0.08227229118347168, + "logits/rejected": 0.07269458472728729, + "logps/chosen": -2.0386552810668945, + "logps/rejected": -2.636596441268921, + "loss": 0.5819, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.0386552810668945, + "rewards/margins": 0.5979411602020264, + "rewards/rejected": -2.636596441268921, + "sft_loss": 2.055975914001465, + "step": 805 + }, + { + "epoch": 0.43351731058705467, + "grad_norm": 9.383016112080094, + "learning_rate": 9.939967071845424e-07, + "logits/chosen": 0.019783342257142067, + "logits/rejected": 0.0919952541589737, + "logps/chosen": -2.1240878105163574, + "logps/rejected": -2.4889702796936035, + "loss": 0.6429, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.1240878105163574, + "rewards/margins": 0.3648822009563446, + "rewards/rejected": -2.4889702796936035, + "sft_loss": 2.1461615562438965, + "step": 810 + }, + { + "epoch": 0.4361933433684563, + "grad_norm": 14.285322895558972, + "learning_rate": 9.937536987168413e-07, + "logits/chosen": 0.05799783021211624, + "logits/rejected": 0.18904006481170654, + "logps/chosen": -2.104933977127075, + "logps/rejected": -2.6830811500549316, + "loss": 0.6267, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.104933977127075, + "rewards/margins": 0.5781470537185669, + "rewards/rejected": -2.6830811500549316, + "sft_loss": 2.1595168113708496, + "step": 815 + }, + { + "epoch": 0.4388693761498578, + "grad_norm": 11.488414591128015, + "learning_rate": 9.935058998485896e-07, + "logits/chosen": 0.0572284460067749, + "logits/rejected": 0.10458134114742279, + "logps/chosen": -2.1640007495880127, + "logps/rejected": -2.5970141887664795, + "loss": 0.6493, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -2.1640007495880127, + "rewards/margins": 0.43301382660865784, + "rewards/rejected": -2.5970141887664795, + "sft_loss": 2.146779775619507, + "step": 820 + }, + { + "epoch": 0.44154540893125943, + "grad_norm": 17.889890042811206, + "learning_rate": 9.932533129839333e-07, + "logits/chosen": 0.008310935460031033, + "logits/rejected": 0.12683984637260437, + "logps/chosen": -2.0136170387268066, + "logps/rejected": -2.5307841300964355, + "loss": 0.6103, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.0136170387268066, + "rewards/margins": 0.5171666145324707, + "rewards/rejected": -2.5307841300964355, + "sft_loss": 2.116117000579834, + "step": 825 + }, + { + "epoch": 0.444221441712661, + "grad_norm": 10.465878733412262, + "learning_rate": 9.929959405734711e-07, + "logits/chosen": 0.10247880220413208, + "logits/rejected": 0.26821228861808777, + "logps/chosen": -2.043735980987549, + "logps/rejected": -2.4065542221069336, + "loss": 0.6423, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.043735980987549, + "rewards/margins": 0.36281818151474, + "rewards/rejected": -2.4065542221069336, + "sft_loss": 2.042168378829956, + "step": 830 + }, + { + "epoch": 0.44689747449406253, + "grad_norm": 8.980836413849559, + "learning_rate": 9.927337851142314e-07, + "logits/chosen": 0.042201682925224304, + "logits/rejected": 0.1663995385169983, + "logps/chosen": -1.9975996017456055, + "logps/rejected": -2.3808562755584717, + "loss": 0.6439, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.9975996017456055, + "rewards/margins": 0.38325658440589905, + "rewards/rejected": -2.3808562755584717, + "sft_loss": 2.103684902191162, + "step": 835 + }, + { + "epoch": 0.44957350727546413, + "grad_norm": 8.385913958085657, + "learning_rate": 9.924668491496474e-07, + "logits/chosen": 0.02358698472380638, + "logits/rejected": 0.19346722960472107, + "logps/chosen": -2.174470901489258, + "logps/rejected": -2.5670955181121826, + "loss": 0.6596, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.174470901489258, + "rewards/margins": 0.3926246762275696, + "rewards/rejected": -2.5670955181121826, + "sft_loss": 2.2588775157928467, + "step": 840 + }, + { + "epoch": 0.4522495400568657, + "grad_norm": 6.069412517957521, + "learning_rate": 9.92195135269533e-07, + "logits/chosen": 0.08882583677768707, + "logits/rejected": 0.1478150337934494, + "logps/chosen": -2.1246047019958496, + "logps/rejected": -2.388331174850464, + "loss": 0.6976, + "rewards/accuracies": 0.5625, + "rewards/chosen": -2.1246047019958496, + "rewards/margins": 0.2637265920639038, + "rewards/rejected": -2.388331174850464, + "sft_loss": 2.252500534057617, + "step": 845 + }, + { + "epoch": 0.4549255728382673, + "grad_norm": 9.418875341176285, + "learning_rate": 9.919186461100574e-07, + "logits/chosen": 0.040856409817934036, + "logits/rejected": 0.1182093620300293, + "logps/chosen": -2.1500442028045654, + "logps/rejected": -2.470322608947754, + "loss": 0.646, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.1500442028045654, + "rewards/margins": 0.3202785849571228, + "rewards/rejected": -2.470322608947754, + "sft_loss": 2.192167043685913, + "step": 850 + }, + { + "epoch": 0.45760160561966884, + "grad_norm": 13.78367496344772, + "learning_rate": 9.9163738435372e-07, + "logits/chosen": 0.005407715681940317, + "logits/rejected": 0.15354683995246887, + "logps/chosen": -2.170201301574707, + "logps/rejected": -2.624565601348877, + "loss": 0.6577, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -2.170201301574707, + "rewards/margins": 0.454364150762558, + "rewards/rejected": -2.624565601348877, + "sft_loss": 2.2026968002319336, + "step": 855 + }, + { + "epoch": 0.4602776384010704, + "grad_norm": 7.739859753372353, + "learning_rate": 9.913513527293234e-07, + "logits/chosen": -0.03920990601181984, + "logits/rejected": 0.12746313214302063, + "logps/chosen": -2.2391293048858643, + "logps/rejected": -2.791182279586792, + "loss": 0.6081, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -2.2391293048858643, + "rewards/margins": 0.5520530343055725, + "rewards/rejected": -2.791182279586792, + "sft_loss": 2.3047616481781006, + "step": 860 + }, + { + "epoch": 0.462953671182472, + "grad_norm": 23.85659179365519, + "learning_rate": 9.910605540119474e-07, + "logits/chosen": 0.03437185287475586, + "logits/rejected": 0.1299118548631668, + "logps/chosen": -2.114776849746704, + "logps/rejected": -2.64350962638855, + "loss": 0.6475, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -2.114776849746704, + "rewards/margins": 0.5287328958511353, + "rewards/rejected": -2.64350962638855, + "sft_loss": 2.149594783782959, + "step": 865 + }, + { + "epoch": 0.46562970396387354, + "grad_norm": 7.068050548964723, + "learning_rate": 9.907649910229227e-07, + "logits/chosen": -0.06576119363307953, + "logits/rejected": 0.19930952787399292, + "logps/chosen": -2.1443984508514404, + "logps/rejected": -2.663065195083618, + "loss": 0.6076, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.1443984508514404, + "rewards/margins": 0.5186666250228882, + "rewards/rejected": -2.663065195083618, + "sft_loss": 2.212402820587158, + "step": 870 + }, + { + "epoch": 0.46830573674527515, + "grad_norm": 11.088938035410244, + "learning_rate": 9.90464666629803e-07, + "logits/chosen": 0.03996530547738075, + "logits/rejected": 0.12184039503335953, + "logps/chosen": -2.2576634883880615, + "logps/rejected": -2.614593982696533, + "loss": 0.7126, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.2576634883880615, + "rewards/margins": 0.35693034529685974, + "rewards/rejected": -2.614593982696533, + "sft_loss": 2.228409767150879, + "step": 875 + }, + { + "epoch": 0.4709817695266767, + "grad_norm": 8.271474030444953, + "learning_rate": 9.901595837463363e-07, + "logits/chosen": 0.0436701737344265, + "logits/rejected": 0.22366786003112793, + "logps/chosen": -2.2589449882507324, + "logps/rejected": -2.7298736572265625, + "loss": 0.6217, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.2589449882507324, + "rewards/margins": 0.47092896699905396, + "rewards/rejected": -2.7298736572265625, + "sft_loss": 2.1659958362579346, + "step": 880 + }, + { + "epoch": 0.47365780230807825, + "grad_norm": 9.519114072202234, + "learning_rate": 9.898497453324384e-07, + "logits/chosen": -0.06962232291698456, + "logits/rejected": 0.014059120789170265, + "logps/chosen": -2.2020440101623535, + "logps/rejected": -2.7084178924560547, + "loss": 0.5909, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.2020440101623535, + "rewards/margins": 0.5063741207122803, + "rewards/rejected": -2.7084178924560547, + "sft_loss": 2.2662155628204346, + "step": 885 + }, + { + "epoch": 0.47633383508947985, + "grad_norm": 6.855135000012716, + "learning_rate": 9.895351543941628e-07, + "logits/chosen": -0.14909687638282776, + "logits/rejected": -0.0239469762891531, + "logps/chosen": -2.174975872039795, + "logps/rejected": -2.598846673965454, + "loss": 0.6291, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.174975872039795, + "rewards/margins": 0.4238705039024353, + "rewards/rejected": -2.598846673965454, + "sft_loss": 2.251343250274658, + "step": 890 + }, + { + "epoch": 0.4790098678708814, + "grad_norm": 11.9995465160683, + "learning_rate": 9.892158139836724e-07, + "logits/chosen": 0.0655425637960434, + "logits/rejected": 0.17073342204093933, + "logps/chosen": -2.043718099594116, + "logps/rejected": -2.407827854156494, + "loss": 0.6423, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -2.043718099594116, + "rewards/margins": 0.3641095757484436, + "rewards/rejected": -2.407827854156494, + "sft_loss": 2.106400966644287, + "step": 895 + }, + { + "epoch": 0.481685900652283, + "grad_norm": 9.267739718647467, + "learning_rate": 9.88891727199209e-07, + "logits/chosen": -0.07653886079788208, + "logits/rejected": -0.0005800087237730622, + "logps/chosen": -2.0188803672790527, + "logps/rejected": -2.47680926322937, + "loss": 0.6263, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.0188803672790527, + "rewards/margins": 0.45792898535728455, + "rewards/rejected": -2.47680926322937, + "sft_loss": 2.075723886489868, + "step": 900 + }, + { + "epoch": 0.48436193343368455, + "grad_norm": 9.434825016785153, + "learning_rate": 9.885628971850641e-07, + "logits/chosen": -0.005064345896244049, + "logits/rejected": 0.18762926757335663, + "logps/chosen": -2.159120559692383, + "logps/rejected": -2.6442129611968994, + "loss": 0.6447, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.159120559692383, + "rewards/margins": 0.4850922226905823, + "rewards/rejected": -2.6442129611968994, + "sft_loss": 2.236520528793335, + "step": 905 + }, + { + "epoch": 0.48703796621508616, + "grad_norm": 6.504563353175588, + "learning_rate": 9.882293271315481e-07, + "logits/chosen": -0.020215703174471855, + "logits/rejected": 0.07510758936405182, + "logps/chosen": -2.161691904067993, + "logps/rejected": -2.5602848529815674, + "loss": 0.6645, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.161691904067993, + "rewards/margins": 0.3985927700996399, + "rewards/rejected": -2.5602848529815674, + "sft_loss": 2.1818904876708984, + "step": 910 + }, + { + "epoch": 0.4897139989964877, + "grad_norm": 8.088147446053588, + "learning_rate": 9.878910202749589e-07, + "logits/chosen": -0.030447423458099365, + "logits/rejected": 0.1630699634552002, + "logps/chosen": -2.1108880043029785, + "logps/rejected": -2.5778558254241943, + "loss": 0.6127, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.1108880043029785, + "rewards/margins": 0.4669678807258606, + "rewards/rejected": -2.5778558254241943, + "sft_loss": 2.157522439956665, + "step": 915 + }, + { + "epoch": 0.49239003177788926, + "grad_norm": 8.554766860204774, + "learning_rate": 9.875479798975512e-07, + "logits/chosen": 0.05800174921751022, + "logits/rejected": 0.2097451388835907, + "logps/chosen": -2.054659366607666, + "logps/rejected": -2.5831289291381836, + "loss": 0.6255, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.054659366607666, + "rewards/margins": 0.5284695625305176, + "rewards/rejected": -2.5831289291381836, + "sft_loss": 2.185912609100342, + "step": 920 + }, + { + "epoch": 0.49506606455929086, + "grad_norm": 11.034199446323399, + "learning_rate": 9.87200209327504e-07, + "logits/chosen": -0.027339598163962364, + "logits/rejected": 0.1445145457983017, + "logps/chosen": -2.3099098205566406, + "logps/rejected": -2.737062454223633, + "loss": 0.626, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -2.3099098205566406, + "rewards/margins": 0.427152544260025, + "rewards/rejected": -2.737062454223633, + "sft_loss": 2.298609972000122, + "step": 925 + }, + { + "epoch": 0.4977420973406924, + "grad_norm": 12.015830078755586, + "learning_rate": 9.868477119388894e-07, + "logits/chosen": -0.043401092290878296, + "logits/rejected": 0.07257211208343506, + "logps/chosen": -2.1922214031219482, + "logps/rejected": -2.7571864128112793, + "loss": 0.6264, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.1922214031219482, + "rewards/margins": 0.5649651288986206, + "rewards/rejected": -2.7571864128112793, + "sft_loss": 2.2458460330963135, + "step": 930 + }, + { + "epoch": 0.500418130122094, + "grad_norm": 8.727604479465507, + "learning_rate": 9.864904911516383e-07, + "logits/chosen": 0.019951870664954185, + "logits/rejected": 0.06993341445922852, + "logps/chosen": -2.2700858116149902, + "logps/rejected": -2.6822025775909424, + "loss": 0.6514, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.2700858116149902, + "rewards/margins": 0.4121168255805969, + "rewards/rejected": -2.6822025775909424, + "sft_loss": 2.372786283493042, + "step": 935 + }, + { + "epoch": 0.5030941629034956, + "grad_norm": 9.246727041827082, + "learning_rate": 9.861285504315084e-07, + "logits/chosen": -0.009649311192333698, + "logits/rejected": 0.10374053567647934, + "logps/chosen": -2.1735637187957764, + "logps/rejected": -2.6011998653411865, + "loss": 0.6233, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.1735637187957764, + "rewards/margins": 0.427636057138443, + "rewards/rejected": -2.6011998653411865, + "sft_loss": 2.2195963859558105, + "step": 940 + }, + { + "epoch": 0.5057701956848971, + "grad_norm": 9.023576880931797, + "learning_rate": 9.857618932900502e-07, + "logits/chosen": -0.045220039784908295, + "logits/rejected": 0.10452475398778915, + "logps/chosen": -2.1646695137023926, + "logps/rejected": -2.719238758087158, + "loss": 0.5769, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.1646695137023926, + "rewards/margins": 0.554568886756897, + "rewards/rejected": -2.719238758087158, + "sft_loss": 2.2067770957946777, + "step": 945 + }, + { + "epoch": 0.5084462284662987, + "grad_norm": 9.790405508612462, + "learning_rate": 9.853905232845727e-07, + "logits/chosen": -0.0507919080555439, + "logits/rejected": 0.12988582253456116, + "logps/chosen": -2.31115984916687, + "logps/rejected": -2.7202539443969727, + "loss": 0.6656, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -2.31115984916687, + "rewards/margins": 0.4090944230556488, + "rewards/rejected": -2.7202539443969727, + "sft_loss": 2.3003106117248535, + "step": 950 + }, + { + "epoch": 0.5111222612477003, + "grad_norm": 9.675479981290406, + "learning_rate": 9.850144440181095e-07, + "logits/chosen": 0.007676619105041027, + "logits/rejected": 0.23354463279247284, + "logps/chosen": -2.39599347114563, + "logps/rejected": -2.8343722820281982, + "loss": 0.6345, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.39599347114563, + "rewards/margins": 0.4383786618709564, + "rewards/rejected": -2.8343722820281982, + "sft_loss": 2.4643120765686035, + "step": 955 + }, + { + "epoch": 0.5137982940291018, + "grad_norm": 8.612987321414094, + "learning_rate": 9.846336591393832e-07, + "logits/chosen": -0.025504469871520996, + "logits/rejected": 0.12795531749725342, + "logps/chosen": -2.4095778465270996, + "logps/rejected": -2.8609580993652344, + "loss": 0.635, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.4095778465270996, + "rewards/margins": 0.4513804018497467, + "rewards/rejected": -2.8609580993652344, + "sft_loss": 2.4834115505218506, + "step": 960 + }, + { + "epoch": 0.5164743268105034, + "grad_norm": 9.36794990504209, + "learning_rate": 9.842481723427704e-07, + "logits/chosen": 0.09307606518268585, + "logits/rejected": 0.10523343086242676, + "logps/chosen": -2.4937140941619873, + "logps/rejected": -2.9816248416900635, + "loss": 0.658, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.4937140941619873, + "rewards/margins": 0.487910658121109, + "rewards/rejected": -2.9816248416900635, + "sft_loss": 2.5522685050964355, + "step": 965 + }, + { + "epoch": 0.519150359591905, + "grad_norm": 8.475134352425844, + "learning_rate": 9.838579873682658e-07, + "logits/chosen": 0.059538520872592926, + "logits/rejected": 0.07206230610609055, + "logps/chosen": -2.302995204925537, + "logps/rejected": -2.6984548568725586, + "loss": 0.6553, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.302995204925537, + "rewards/margins": 0.3954595923423767, + "rewards/rejected": -2.6984548568725586, + "sft_loss": 2.3630242347717285, + "step": 970 + }, + { + "epoch": 0.5218263923733065, + "grad_norm": 8.725772671105329, + "learning_rate": 9.834631080014457e-07, + "logits/chosen": -0.06823412328958511, + "logits/rejected": 0.13702434301376343, + "logps/chosen": -2.2840070724487305, + "logps/rejected": -2.745213747024536, + "loss": 0.5947, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.2840070724487305, + "rewards/margins": 0.4612065255641937, + "rewards/rejected": -2.745213747024536, + "sft_loss": 2.383709669113159, + "step": 975 + }, + { + "epoch": 0.5245024251547081, + "grad_norm": 14.382330843645065, + "learning_rate": 9.830635380734312e-07, + "logits/chosen": -0.07350875437259674, + "logits/rejected": 0.12759271264076233, + "logps/chosen": -2.4287877082824707, + "logps/rejected": -2.8255655765533447, + "loss": 0.6507, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -2.4287877082824707, + "rewards/margins": 0.39677804708480835, + "rewards/rejected": -2.8255655765533447, + "sft_loss": 2.4610254764556885, + "step": 980 + }, + { + "epoch": 0.5271784579361097, + "grad_norm": 10.325135195234868, + "learning_rate": 9.826592814608517e-07, + "logits/chosen": 0.010499343276023865, + "logits/rejected": 0.22065281867980957, + "logps/chosen": -2.3261592388153076, + "logps/rejected": -2.746830463409424, + "loss": 0.6346, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.3261592388153076, + "rewards/margins": 0.42067116498947144, + "rewards/rejected": -2.746830463409424, + "sft_loss": 2.400534152984619, + "step": 985 + }, + { + "epoch": 0.5298544907175113, + "grad_norm": 10.580647081118688, + "learning_rate": 9.822503420858067e-07, + "logits/chosen": 0.09105484187602997, + "logits/rejected": 0.11873571574687958, + "logps/chosen": -2.158053398132324, + "logps/rejected": -2.67893385887146, + "loss": 0.6008, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.158053398132324, + "rewards/margins": 0.5208802819252014, + "rewards/rejected": -2.67893385887146, + "sft_loss": 2.3424346446990967, + "step": 990 + }, + { + "epoch": 0.5325305234989128, + "grad_norm": 9.96487400254228, + "learning_rate": 9.818367239158277e-07, + "logits/chosen": 0.09545004367828369, + "logits/rejected": 0.18055422604084015, + "logps/chosen": -2.286459445953369, + "logps/rejected": -2.673666000366211, + "loss": 0.6709, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -2.286459445953369, + "rewards/margins": 0.38720664381980896, + "rewards/rejected": -2.673666000366211, + "sft_loss": 2.4550933837890625, + "step": 995 + }, + { + "epoch": 0.5352065562803144, + "grad_norm": 10.265248076385419, + "learning_rate": 9.8141843096384e-07, + "logits/chosen": 0.06765373051166534, + "logits/rejected": 0.19860796630382538, + "logps/chosen": -2.3884854316711426, + "logps/rejected": -2.9137723445892334, + "loss": 0.6063, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.3884854316711426, + "rewards/margins": 0.5252869129180908, + "rewards/rejected": -2.9137723445892334, + "sft_loss": 2.4504899978637695, + "step": 1000 + }, + { + "epoch": 0.537882589061716, + "grad_norm": 11.740882940902146, + "learning_rate": 9.809954672881237e-07, + "logits/chosen": 0.05435361713171005, + "logits/rejected": 0.23164144158363342, + "logps/chosen": -2.520017147064209, + "logps/rejected": -2.9663870334625244, + "loss": 0.6627, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.520017147064209, + "rewards/margins": 0.44636982679367065, + "rewards/rejected": -2.9663870334625244, + "sft_loss": 2.5899224281311035, + "step": 1005 + }, + { + "epoch": 0.5405586218431175, + "grad_norm": 8.925497581186566, + "learning_rate": 9.80567836992274e-07, + "logits/chosen": 0.03229910135269165, + "logits/rejected": 0.23886017501354218, + "logps/chosen": -2.2775940895080566, + "logps/rejected": -2.876896381378174, + "loss": 0.6006, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.2775940895080566, + "rewards/margins": 0.5993021726608276, + "rewards/rejected": -2.876896381378174, + "sft_loss": 2.4006941318511963, + "step": 1010 + }, + { + "epoch": 0.5432346546245191, + "grad_norm": 8.44126902041844, + "learning_rate": 9.801355442251625e-07, + "logits/chosen": 0.0005810469156131148, + "logits/rejected": 0.17584313452243805, + "logps/chosen": -2.3316941261291504, + "logps/rejected": -2.8272385597229004, + "loss": 0.6297, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.3316941261291504, + "rewards/margins": 0.49554443359375, + "rewards/rejected": -2.8272385597229004, + "sft_loss": 2.4663829803466797, + "step": 1015 + }, + { + "epoch": 0.5459106874059207, + "grad_norm": 12.02768296687771, + "learning_rate": 9.796985931808949e-07, + "logits/chosen": 0.005703767295926809, + "logits/rejected": 0.15967944264411926, + "logps/chosen": -2.399937391281128, + "logps/rejected": -2.9279513359069824, + "loss": 0.5931, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.399937391281128, + "rewards/margins": 0.5280137062072754, + "rewards/rejected": -2.9279513359069824, + "sft_loss": 2.5254905223846436, + "step": 1020 + }, + { + "epoch": 0.5485867201873222, + "grad_norm": 10.475937269154038, + "learning_rate": 9.792569880987724e-07, + "logits/chosen": -0.039773181080818176, + "logits/rejected": 0.0928923636674881, + "logps/chosen": -2.4386563301086426, + "logps/rejected": -3.032752513885498, + "loss": 0.5996, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.4386563301086426, + "rewards/margins": 0.5940964221954346, + "rewards/rejected": -3.032752513885498, + "sft_loss": 2.555467128753662, + "step": 1025 + }, + { + "epoch": 0.5512627529687238, + "grad_norm": 14.15306373994566, + "learning_rate": 9.788107332632493e-07, + "logits/chosen": 0.023439515382051468, + "logits/rejected": 0.11487237364053726, + "logps/chosen": -2.4161739349365234, + "logps/rejected": -2.874342918395996, + "loss": 0.6551, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.4161739349365234, + "rewards/margins": 0.458169162273407, + "rewards/rejected": -2.874342918395996, + "sft_loss": 2.5372936725616455, + "step": 1030 + }, + { + "epoch": 0.5539387857501255, + "grad_norm": 8.847741885689816, + "learning_rate": 9.783598330038924e-07, + "logits/chosen": -0.024633217602968216, + "logits/rejected": 0.10054673254489899, + "logps/chosen": -2.4228594303131104, + "logps/rejected": -2.8241658210754395, + "loss": 0.6474, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -2.4228594303131104, + "rewards/margins": 0.4013066291809082, + "rewards/rejected": -2.8241658210754395, + "sft_loss": 2.4797160625457764, + "step": 1035 + }, + { + "epoch": 0.5566148185315271, + "grad_norm": 13.358408093082733, + "learning_rate": 9.779042916953376e-07, + "logits/chosen": 0.022121794521808624, + "logits/rejected": 0.20207636058330536, + "logps/chosen": -2.1366634368896484, + "logps/rejected": -2.747373104095459, + "loss": 0.5924, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.1366634368896484, + "rewards/margins": 0.6107093095779419, + "rewards/rejected": -2.747373104095459, + "sft_loss": 2.242213487625122, + "step": 1040 + }, + { + "epoch": 0.5592908513129285, + "grad_norm": 7.435232618941414, + "learning_rate": 9.774441137572487e-07, + "logits/chosen": -0.04853493347764015, + "logits/rejected": 0.11352036148309708, + "logps/chosen": -2.305412769317627, + "logps/rejected": -2.8964731693267822, + "loss": 0.586, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.305412769317627, + "rewards/margins": 0.5910605788230896, + "rewards/rejected": -2.8964731693267822, + "sft_loss": 2.4367892742156982, + "step": 1045 + }, + { + "epoch": 0.5619668840943302, + "grad_norm": 10.496072454599288, + "learning_rate": 9.76979303654274e-07, + "logits/chosen": -0.07789639383554459, + "logits/rejected": 0.03533010184764862, + "logps/chosen": -2.4677746295928955, + "logps/rejected": -3.0698654651641846, + "loss": 0.595, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.4677746295928955, + "rewards/margins": 0.6020905375480652, + "rewards/rejected": -3.0698654651641846, + "sft_loss": 2.5504465103149414, + "step": 1050 + }, + { + "epoch": 0.5646429168757318, + "grad_norm": 12.156821575710055, + "learning_rate": 9.765098658960035e-07, + "logits/chosen": 0.011598305776715279, + "logits/rejected": 0.09234117716550827, + "logps/chosen": -2.443835496902466, + "logps/rejected": -2.99617600440979, + "loss": 0.5969, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.443835496902466, + "rewards/margins": 0.5523403882980347, + "rewards/rejected": -2.99617600440979, + "sft_loss": 2.48038649559021, + "step": 1055 + }, + { + "epoch": 0.5673189496571333, + "grad_norm": 12.482189019423522, + "learning_rate": 9.76035805036924e-07, + "logits/chosen": 0.06962551921606064, + "logits/rejected": 0.2640361189842224, + "logps/chosen": -2.56504487991333, + "logps/rejected": -3.0283312797546387, + "loss": 0.6309, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.56504487991333, + "rewards/margins": 0.4632865786552429, + "rewards/rejected": -3.0283312797546387, + "sft_loss": 2.5605111122131348, + "step": 1060 + }, + { + "epoch": 0.5699949824385349, + "grad_norm": 10.862661070793598, + "learning_rate": 9.755571256763764e-07, + "logits/chosen": 0.050166137516498566, + "logits/rejected": 0.18997737765312195, + "logps/chosen": -2.4118528366088867, + "logps/rejected": -3.04915189743042, + "loss": 0.5701, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.4118528366088867, + "rewards/margins": 0.6372988820075989, + "rewards/rejected": -3.04915189743042, + "sft_loss": 2.5698437690734863, + "step": 1065 + }, + { + "epoch": 0.5726710152199365, + "grad_norm": 8.554618594416997, + "learning_rate": 9.750738324585097e-07, + "logits/chosen": -0.05651255324482918, + "logits/rejected": 0.20479531586170197, + "logps/chosen": -2.5034983158111572, + "logps/rejected": -3.0640318393707275, + "loss": 0.592, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.5034983158111572, + "rewards/margins": 0.5605336427688599, + "rewards/rejected": -3.0640318393707275, + "sft_loss": 2.6030170917510986, + "step": 1070 + }, + { + "epoch": 0.5753470480013381, + "grad_norm": 7.239548206687595, + "learning_rate": 9.74585930072237e-07, + "logits/chosen": 0.032971903681755066, + "logits/rejected": 0.17771092057228088, + "logps/chosen": -2.311800956726074, + "logps/rejected": -2.924129009246826, + "loss": 0.6031, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.311800956726074, + "rewards/margins": 0.6123279929161072, + "rewards/rejected": -2.924129009246826, + "sft_loss": 2.4453539848327637, + "step": 1075 + }, + { + "epoch": 0.5780230807827396, + "grad_norm": 9.823708542127223, + "learning_rate": 9.740934232511892e-07, + "logits/chosen": -0.05281168222427368, + "logits/rejected": 0.06951533257961273, + "logps/chosen": -2.492975950241089, + "logps/rejected": -3.0146288871765137, + "loss": 0.6193, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.492975950241089, + "rewards/margins": 0.52165287733078, + "rewards/rejected": -3.0146288871765137, + "sft_loss": 2.663308620452881, + "step": 1080 + }, + { + "epoch": 0.5806991135641412, + "grad_norm": 11.971129664715203, + "learning_rate": 9.735963167736698e-07, + "logits/chosen": 0.019016049802303314, + "logits/rejected": 0.2044394314289093, + "logps/chosen": -2.41227388381958, + "logps/rejected": -2.7654335498809814, + "loss": 0.6744, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -2.41227388381958, + "rewards/margins": 0.35315990447998047, + "rewards/rejected": -2.7654335498809814, + "sft_loss": 2.4880402088165283, + "step": 1085 + }, + { + "epoch": 0.5833751463455428, + "grad_norm": 8.836003071123491, + "learning_rate": 9.730946154626078e-07, + "logits/chosen": 0.028936797752976418, + "logits/rejected": 0.1407909095287323, + "logps/chosen": -2.507131576538086, + "logps/rejected": -2.9046547412872314, + "loss": 0.672, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -2.507131576538086, + "rewards/margins": 0.397522896528244, + "rewards/rejected": -2.9046547412872314, + "sft_loss": 2.6242282390594482, + "step": 1090 + }, + { + "epoch": 0.5860511791269443, + "grad_norm": 13.877213002027947, + "learning_rate": 9.725883241855117e-07, + "logits/chosen": -0.10876253992319107, + "logits/rejected": 0.05406232550740242, + "logps/chosen": -2.434305191040039, + "logps/rejected": -3.0059266090393066, + "loss": 0.5916, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.434305191040039, + "rewards/margins": 0.5716217756271362, + "rewards/rejected": -3.0059266090393066, + "sft_loss": 2.587421178817749, + "step": 1095 + }, + { + "epoch": 0.5887272119083459, + "grad_norm": 9.283286246160356, + "learning_rate": 9.720774478544218e-07, + "logits/chosen": 0.015044411644339561, + "logits/rejected": 0.1409268081188202, + "logps/chosen": -2.320230722427368, + "logps/rejected": -2.980865478515625, + "loss": 0.6017, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.320230722427368, + "rewards/margins": 0.6606348752975464, + "rewards/rejected": -2.980865478515625, + "sft_loss": 2.453669309616089, + "step": 1100 + }, + { + "epoch": 0.5914032446897475, + "grad_norm": 9.587947641293601, + "learning_rate": 9.715619914258624e-07, + "logits/chosen": -0.07415847480297089, + "logits/rejected": 0.025357436388731003, + "logps/chosen": -2.4371161460876465, + "logps/rejected": -2.8709144592285156, + "loss": 0.6434, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.4371161460876465, + "rewards/margins": 0.4337983727455139, + "rewards/rejected": -2.8709144592285156, + "sft_loss": 2.452127456665039, + "step": 1105 + }, + { + "epoch": 0.594079277471149, + "grad_norm": 12.43499144947583, + "learning_rate": 9.710419599007937e-07, + "logits/chosen": -0.020220275968313217, + "logits/rejected": 0.12485732138156891, + "logps/chosen": -2.4217236042022705, + "logps/rejected": -2.8118643760681152, + "loss": 0.6412, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.4217236042022705, + "rewards/margins": 0.39014047384262085, + "rewards/rejected": -2.8118643760681152, + "sft_loss": 2.513850212097168, + "step": 1110 + }, + { + "epoch": 0.5967553102525506, + "grad_norm": 11.869515874986861, + "learning_rate": 9.705173583245643e-07, + "logits/chosen": 0.03815209120512009, + "logits/rejected": 0.20307651162147522, + "logps/chosen": -2.2028329372406006, + "logps/rejected": -2.7756314277648926, + "loss": 0.5967, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.2028329372406006, + "rewards/margins": 0.5727984309196472, + "rewards/rejected": -2.7756314277648926, + "sft_loss": 2.2485616207122803, + "step": 1115 + }, + { + "epoch": 0.5994313430339522, + "grad_norm": 7.423096137669163, + "learning_rate": 9.699881917868609e-07, + "logits/chosen": -0.13720020651817322, + "logits/rejected": -0.012343773618340492, + "logps/chosen": -2.245342969894409, + "logps/rejected": -2.795599937438965, + "loss": 0.6001, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.245342969894409, + "rewards/margins": 0.55025714635849, + "rewards/rejected": -2.795599937438965, + "sft_loss": 2.408857822418213, + "step": 1120 + }, + { + "epoch": 0.6021073758153538, + "grad_norm": 9.075458489681749, + "learning_rate": 9.694544654216594e-07, + "logits/chosen": -0.1065760999917984, + "logits/rejected": 0.10421280562877655, + "logps/chosen": -2.33565092086792, + "logps/rejected": -2.943451404571533, + "loss": 0.5765, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.33565092086792, + "rewards/margins": 0.6078003644943237, + "rewards/rejected": -2.943451404571533, + "sft_loss": 2.3767237663269043, + "step": 1125 + }, + { + "epoch": 0.6047834085967553, + "grad_norm": 12.459931336585251, + "learning_rate": 9.689161844071755e-07, + "logits/chosen": 0.05389224365353584, + "logits/rejected": 0.1359303891658783, + "logps/chosen": -2.204026699066162, + "logps/rejected": -2.6704185009002686, + "loss": 0.6297, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.204026699066162, + "rewards/margins": 0.4663916230201721, + "rewards/rejected": -2.6704185009002686, + "sft_loss": 2.2297959327697754, + "step": 1130 + }, + { + "epoch": 0.6074594413781569, + "grad_norm": 9.910833434965273, + "learning_rate": 9.683733539658138e-07, + "logits/chosen": -0.034460533410310745, + "logits/rejected": 0.15427373349666595, + "logps/chosen": -2.3623645305633545, + "logps/rejected": -2.85990047454834, + "loss": 0.6379, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.3623645305633545, + "rewards/margins": 0.4975360929965973, + "rewards/rejected": -2.85990047454834, + "sft_loss": 2.3342556953430176, + "step": 1135 + }, + { + "epoch": 0.6101354741595585, + "grad_norm": 10.119642658770305, + "learning_rate": 9.678259793641178e-07, + "logits/chosen": -0.005990887992084026, + "logits/rejected": 0.048573773354291916, + "logps/chosen": -2.343507766723633, + "logps/rejected": -2.6922554969787598, + "loss": 0.6553, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.343507766723633, + "rewards/margins": 0.3487474322319031, + "rewards/rejected": -2.6922554969787598, + "sft_loss": 2.4332337379455566, + "step": 1140 + }, + { + "epoch": 0.61281150694096, + "grad_norm": 8.549538226355667, + "learning_rate": 9.672740659127183e-07, + "logits/chosen": -0.14116328954696655, + "logits/rejected": 0.010193651542067528, + "logps/chosen": -2.280282974243164, + "logps/rejected": -2.8611793518066406, + "loss": 0.6175, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.280282974243164, + "rewards/margins": 0.580896258354187, + "rewards/rejected": -2.8611793518066406, + "sft_loss": 2.414921283721924, + "step": 1145 + }, + { + "epoch": 0.6154875397223616, + "grad_norm": 9.265583223855089, + "learning_rate": 9.667176189662818e-07, + "logits/chosen": -0.11378173530101776, + "logits/rejected": 0.03218189626932144, + "logps/chosen": -2.2497658729553223, + "logps/rejected": -2.8140835762023926, + "loss": 0.5923, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.2497658729553223, + "rewards/margins": 0.5643175840377808, + "rewards/rejected": -2.8140835762023926, + "sft_loss": 2.3082830905914307, + "step": 1150 + }, + { + "epoch": 0.6181635725037632, + "grad_norm": 8.945456930621933, + "learning_rate": 9.661566439234592e-07, + "logits/chosen": -0.010784052312374115, + "logits/rejected": 0.10385878384113312, + "logps/chosen": -2.259997606277466, + "logps/rejected": -2.689664840698242, + "loss": 0.6281, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.259997606277466, + "rewards/margins": 0.42966747283935547, + "rewards/rejected": -2.689664840698242, + "sft_loss": 2.386997699737549, + "step": 1155 + }, + { + "epoch": 0.6208396052851648, + "grad_norm": 9.913210782246972, + "learning_rate": 9.655911462268327e-07, + "logits/chosen": 0.058179665356874466, + "logits/rejected": 0.17801596224308014, + "logps/chosen": -2.320453643798828, + "logps/rejected": -2.9494757652282715, + "loss": 0.547, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.320453643798828, + "rewards/margins": 0.6290220022201538, + "rewards/rejected": -2.9494757652282715, + "sft_loss": 2.536233901977539, + "step": 1160 + }, + { + "epoch": 0.6235156380665663, + "grad_norm": 9.03448641219398, + "learning_rate": 9.650211313628636e-07, + "logits/chosen": -0.026484167203307152, + "logits/rejected": 0.07927101105451584, + "logps/chosen": -2.41178560256958, + "logps/rejected": -2.841890811920166, + "loss": 0.6468, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.41178560256958, + "rewards/margins": 0.4301057457923889, + "rewards/rejected": -2.841890811920166, + "sft_loss": 2.56024169921875, + "step": 1165 + }, + { + "epoch": 0.6261916708479679, + "grad_norm": 7.4627355054266555, + "learning_rate": 9.644466048618386e-07, + "logits/chosen": -0.013878998346626759, + "logits/rejected": 0.15998892486095428, + "logps/chosen": -2.7046947479248047, + "logps/rejected": -3.2395119667053223, + "loss": 0.6503, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.7046947479248047, + "rewards/margins": 0.534817099571228, + "rewards/rejected": -3.2395119667053223, + "sft_loss": 2.7282543182373047, + "step": 1170 + }, + { + "epoch": 0.6288677036293695, + "grad_norm": 8.984602343546689, + "learning_rate": 9.63867572297816e-07, + "logits/chosen": -0.025428790599107742, + "logits/rejected": 0.18897771835327148, + "logps/chosen": -2.431147336959839, + "logps/rejected": -2.9441728591918945, + "loss": 0.625, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.431147336959839, + "rewards/margins": 0.5130254030227661, + "rewards/rejected": -2.9441728591918945, + "sft_loss": 2.5837855339050293, + "step": 1175 + }, + { + "epoch": 0.631543736410771, + "grad_norm": 10.646614025720375, + "learning_rate": 9.632840392885727e-07, + "logits/chosen": -0.05773182958364487, + "logits/rejected": 0.12018024921417236, + "logps/chosen": -2.6617088317871094, + "logps/rejected": -3.2409186363220215, + "loss": 0.6207, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.6617088317871094, + "rewards/margins": 0.5792103409767151, + "rewards/rejected": -3.2409186363220215, + "sft_loss": 2.7341196537017822, + "step": 1180 + }, + { + "epoch": 0.6342197691921726, + "grad_norm": 8.685344328099887, + "learning_rate": 9.626960114955483e-07, + "logits/chosen": 0.0071663991548120975, + "logits/rejected": 0.1699160784482956, + "logps/chosen": -2.555429458618164, + "logps/rejected": -3.1707100868225098, + "loss": 0.6035, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.555429458618164, + "rewards/margins": 0.6152806282043457, + "rewards/rejected": -3.1707100868225098, + "sft_loss": 2.6178455352783203, + "step": 1185 + }, + { + "epoch": 0.6368958019735742, + "grad_norm": 11.533136482896289, + "learning_rate": 9.621034946237909e-07, + "logits/chosen": -0.05947018787264824, + "logits/rejected": 0.1066945418715477, + "logps/chosen": -2.5827643871307373, + "logps/rejected": -3.259899854660034, + "loss": 0.5655, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.5827643871307373, + "rewards/margins": 0.6771354675292969, + "rewards/rejected": -3.259899854660034, + "sft_loss": 2.7543718814849854, + "step": 1190 + }, + { + "epoch": 0.6395718347549757, + "grad_norm": 9.564838984193049, + "learning_rate": 9.615064944219021e-07, + "logits/chosen": 0.01643727719783783, + "logits/rejected": 0.1448705494403839, + "logps/chosen": -2.372252941131592, + "logps/rejected": -3.0352444648742676, + "loss": 0.5645, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.372252941131592, + "rewards/margins": 0.6629914045333862, + "rewards/rejected": -3.0352444648742676, + "sft_loss": 2.5685365200042725, + "step": 1195 + }, + { + "epoch": 0.6422478675363773, + "grad_norm": 15.41012291667016, + "learning_rate": 9.609050166819803e-07, + "logits/chosen": -0.037835728377103806, + "logits/rejected": 0.043718576431274414, + "logps/chosen": -2.5800116062164307, + "logps/rejected": -3.1163458824157715, + "loss": 0.6242, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.5800116062164307, + "rewards/margins": 0.5363345146179199, + "rewards/rejected": -3.1163458824157715, + "sft_loss": 2.6192474365234375, + "step": 1200 + }, + { + "epoch": 0.6422478675363773, + "eval_logits/chosen": 0.41017210483551025, + "eval_logits/rejected": 0.523736834526062, + "eval_logps/chosen": -2.484957456588745, + "eval_logps/rejected": -3.1039416790008545, + "eval_loss": 0.5881574749946594, + "eval_rewards/accuracies": 0.6973294019699097, + "eval_rewards/chosen": -2.484957456588745, + "eval_rewards/margins": 0.618984043598175, + "eval_rewards/rejected": -3.1039416790008545, + "eval_runtime": 52.8258, + "eval_samples_per_second": 25.461, + "eval_sft_loss": 2.6278696060180664, + "eval_steps_per_second": 6.379, + "step": 1200 + }, + { + "epoch": 0.6449239003177789, + "grad_norm": 16.223702023822415, + "learning_rate": 9.602990672395653e-07, + "logits/chosen": -0.12028829753398895, + "logits/rejected": 0.09198782593011856, + "logps/chosen": -2.447338342666626, + "logps/rejected": -3.0379064083099365, + "loss": 0.5845, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.447338342666626, + "rewards/margins": 0.5905681848526001, + "rewards/rejected": -3.0379064083099365, + "sft_loss": 2.548774003982544, + "step": 1205 + }, + { + "epoch": 0.6475999330991805, + "grad_norm": 10.103767921832775, + "learning_rate": 9.59688651973581e-07, + "logits/chosen": -0.04002942889928818, + "logits/rejected": 0.17909970879554749, + "logps/chosen": -2.391770124435425, + "logps/rejected": -2.9228744506835938, + "loss": 0.5863, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.391770124435425, + "rewards/margins": 0.5311041474342346, + "rewards/rejected": -2.9228744506835938, + "sft_loss": 2.507202625274658, + "step": 1210 + }, + { + "epoch": 0.650275965880582, + "grad_norm": 9.570882970595996, + "learning_rate": 9.590737768062792e-07, + "logits/chosen": -0.08841636031866074, + "logits/rejected": 0.05188380554318428, + "logps/chosen": -2.521583080291748, + "logps/rejected": -2.9929375648498535, + "loss": 0.6329, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.521583080291748, + "rewards/margins": 0.47135478258132935, + "rewards/rejected": -2.9929375648498535, + "sft_loss": 2.6055068969726562, + "step": 1215 + }, + { + "epoch": 0.6529519986619836, + "grad_norm": 9.152708059844768, + "learning_rate": 9.584544477031816e-07, + "logits/chosen": 0.09380490332841873, + "logits/rejected": 0.22841504216194153, + "logps/chosen": -2.2076313495635986, + "logps/rejected": -2.7225444316864014, + "loss": 0.6128, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.2076313495635986, + "rewards/margins": 0.5149132013320923, + "rewards/rejected": -2.7225444316864014, + "sft_loss": 2.280226707458496, + "step": 1220 + }, + { + "epoch": 0.6556280314433852, + "grad_norm": 10.507226530549053, + "learning_rate": 9.578306706730215e-07, + "logits/chosen": -0.13376358151435852, + "logits/rejected": 0.10572312772274017, + "logps/chosen": -2.4945626258850098, + "logps/rejected": -2.9013962745666504, + "loss": 0.6674, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.4945626258850098, + "rewards/margins": 0.40683332085609436, + "rewards/rejected": -2.9013962745666504, + "sft_loss": 2.568493366241455, + "step": 1225 + }, + { + "epoch": 0.6583040642247867, + "grad_norm": 11.20669415454124, + "learning_rate": 9.572024517676865e-07, + "logits/chosen": -0.012587158009409904, + "logits/rejected": 0.10158940404653549, + "logps/chosen": -2.4850094318389893, + "logps/rejected": -2.9105517864227295, + "loss": 0.649, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.4850094318389893, + "rewards/margins": 0.4255426526069641, + "rewards/rejected": -2.9105517864227295, + "sft_loss": 2.536282777786255, + "step": 1230 + }, + { + "epoch": 0.6609800970061883, + "grad_norm": 8.63873296459481, + "learning_rate": 9.565697970821593e-07, + "logits/chosen": 0.01712236925959587, + "logits/rejected": 0.17170551419258118, + "logps/chosen": -2.3881125450134277, + "logps/rejected": -2.8042643070220947, + "loss": 0.6276, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.3881125450134277, + "rewards/margins": 0.41615214943885803, + "rewards/rejected": -2.8042643070220947, + "sft_loss": 2.5537092685699463, + "step": 1235 + }, + { + "epoch": 0.6636561297875899, + "grad_norm": 9.518280928455988, + "learning_rate": 9.559327127544585e-07, + "logits/chosen": -0.09290903806686401, + "logits/rejected": 0.05830955505371094, + "logps/chosen": -2.2869534492492676, + "logps/rejected": -2.7384681701660156, + "loss": 0.6042, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.2869534492492676, + "rewards/margins": 0.45151472091674805, + "rewards/rejected": -2.7384681701660156, + "sft_loss": 2.4760611057281494, + "step": 1240 + }, + { + "epoch": 0.6663321625689914, + "grad_norm": 10.55673759047217, + "learning_rate": 9.552912049655789e-07, + "logits/chosen": -0.01964261755347252, + "logits/rejected": 0.18300272524356842, + "logps/chosen": -2.2489116191864014, + "logps/rejected": -2.731536626815796, + "loss": 0.6167, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.2489116191864014, + "rewards/margins": 0.4826253056526184, + "rewards/rejected": -2.731536626815796, + "sft_loss": 2.314009189605713, + "step": 1245 + }, + { + "epoch": 0.669008195350393, + "grad_norm": 12.081308503661694, + "learning_rate": 9.546452799394315e-07, + "logits/chosen": -0.00539967929944396, + "logits/rejected": 0.21461212635040283, + "logps/chosen": -2.4177112579345703, + "logps/rejected": -2.815523147583008, + "loss": 0.6565, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -2.4177112579345703, + "rewards/margins": 0.3978119492530823, + "rewards/rejected": -2.815523147583008, + "sft_loss": 2.4331181049346924, + "step": 1250 + }, + { + "epoch": 0.6716842281317946, + "grad_norm": 11.044936559753248, + "learning_rate": 9.539949439427846e-07, + "logits/chosen": -0.0206326711922884, + "logits/rejected": 0.1170809417963028, + "logps/chosen": -2.331955671310425, + "logps/rejected": -2.8401269912719727, + "loss": 0.61, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.331955671310425, + "rewards/margins": 0.5081711411476135, + "rewards/rejected": -2.8401269912719727, + "sft_loss": 2.543001651763916, + "step": 1255 + }, + { + "epoch": 0.6743602609131962, + "grad_norm": 8.165133631615923, + "learning_rate": 9.533402032852002e-07, + "logits/chosen": -0.05226556584239006, + "logits/rejected": 0.09908227622509003, + "logps/chosen": -2.407578468322754, + "logps/rejected": -3.0979156494140625, + "loss": 0.5715, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.407578468322754, + "rewards/margins": 0.6903371214866638, + "rewards/rejected": -3.0979156494140625, + "sft_loss": 2.6117091178894043, + "step": 1260 + }, + { + "epoch": 0.6770362936945977, + "grad_norm": 12.847697606224473, + "learning_rate": 9.526810643189754e-07, + "logits/chosen": 0.031186480075120926, + "logits/rejected": 0.2108938992023468, + "logps/chosen": -2.4767446517944336, + "logps/rejected": -3.136263608932495, + "loss": 0.5793, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.4767446517944336, + "rewards/margins": 0.659518837928772, + "rewards/rejected": -3.136263608932495, + "sft_loss": 2.5776114463806152, + "step": 1265 + }, + { + "epoch": 0.6797123264759993, + "grad_norm": 13.802014339563467, + "learning_rate": 9.52017533439079e-07, + "logits/chosen": -0.0694260522723198, + "logits/rejected": 0.029919719323515892, + "logps/chosen": -2.591400146484375, + "logps/rejected": -3.1500930786132812, + "loss": 0.6103, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.591400146484375, + "rewards/margins": 0.5586927533149719, + "rewards/rejected": -3.1500930786132812, + "sft_loss": 2.695070743560791, + "step": 1270 + }, + { + "epoch": 0.6823883592574009, + "grad_norm": 9.669757404786512, + "learning_rate": 9.513496170830909e-07, + "logits/chosen": -0.05950520560145378, + "logits/rejected": 0.06504709273576736, + "logps/chosen": -2.6838889122009277, + "logps/rejected": -3.1975209712982178, + "loss": 0.6543, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -2.6838889122009277, + "rewards/margins": 0.51363205909729, + "rewards/rejected": -3.1975209712982178, + "sft_loss": 2.694319725036621, + "step": 1275 + }, + { + "epoch": 0.6850643920388024, + "grad_norm": 10.902718843836002, + "learning_rate": 9.506773217311382e-07, + "logits/chosen": -0.05056373029947281, + "logits/rejected": 0.13073304295539856, + "logps/chosen": -2.54476261138916, + "logps/rejected": -3.0694384574890137, + "loss": 0.622, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.54476261138916, + "rewards/margins": 0.5246758460998535, + "rewards/rejected": -3.0694384574890137, + "sft_loss": 2.6479544639587402, + "step": 1280 + }, + { + "epoch": 0.687740424820204, + "grad_norm": 10.933403727554007, + "learning_rate": 9.500006539058334e-07, + "logits/chosen": -0.0024568967055529356, + "logits/rejected": 0.16251808404922485, + "logps/chosen": -2.3788487911224365, + "logps/rejected": -2.8265020847320557, + "loss": 0.6192, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.3788487911224365, + "rewards/margins": 0.44765329360961914, + "rewards/rejected": -2.8265020847320557, + "sft_loss": 2.450951099395752, + "step": 1285 + }, + { + "epoch": 0.6904164576016056, + "grad_norm": 11.691876400461265, + "learning_rate": 9.493196201722109e-07, + "logits/chosen": -0.14926186203956604, + "logits/rejected": 0.02142159640789032, + "logps/chosen": -2.5109152793884277, + "logps/rejected": -2.903718948364258, + "loss": 0.6638, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.5109152793884277, + "rewards/margins": 0.39280396699905396, + "rewards/rejected": -2.903718948364258, + "sft_loss": 2.608675479888916, + "step": 1290 + }, + { + "epoch": 0.6930924903830072, + "grad_norm": 7.651439069393412, + "learning_rate": 9.486342271376628e-07, + "logits/chosen": -0.005381585098803043, + "logits/rejected": -0.002708807587623596, + "logps/chosen": -2.4760279655456543, + "logps/rejected": -3.0964133739471436, + "loss": 0.5893, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.4760279655456543, + "rewards/margins": 0.6203856468200684, + "rewards/rejected": -3.0964133739471436, + "sft_loss": 2.5709683895111084, + "step": 1295 + }, + { + "epoch": 0.6957685231644087, + "grad_norm": 11.078381942425255, + "learning_rate": 9.479444814518755e-07, + "logits/chosen": -0.041806433349847794, + "logits/rejected": 0.25304484367370605, + "logps/chosen": -2.383418321609497, + "logps/rejected": -3.0692639350891113, + "loss": 0.5741, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.383418321609497, + "rewards/margins": 0.6858457326889038, + "rewards/rejected": -3.0692639350891113, + "sft_loss": 2.5278241634368896, + "step": 1300 + }, + { + "epoch": 0.6984445559458103, + "grad_norm": 8.606399599948285, + "learning_rate": 9.472503898067645e-07, + "logits/chosen": 0.09567885100841522, + "logits/rejected": 0.15037801861763, + "logps/chosen": -2.4444668292999268, + "logps/rejected": -2.9069840908050537, + "loss": 0.6422, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -2.4444668292999268, + "rewards/margins": 0.4625173509120941, + "rewards/rejected": -2.9069840908050537, + "sft_loss": 2.4670987129211426, + "step": 1305 + }, + { + "epoch": 0.701120588727212, + "grad_norm": 9.235939474310552, + "learning_rate": 9.465519589364099e-07, + "logits/chosen": 0.08462213724851608, + "logits/rejected": 0.1842813789844513, + "logps/chosen": -2.3209452629089355, + "logps/rejected": -2.9042229652404785, + "loss": 0.5986, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.3209452629089355, + "rewards/margins": 0.583277702331543, + "rewards/rejected": -2.9042229652404785, + "sft_loss": 2.447727680206299, + "step": 1310 + }, + { + "epoch": 0.7037966215086134, + "grad_norm": 11.057609220966931, + "learning_rate": 9.458491956169914e-07, + "logits/chosen": -0.0232564527541399, + "logits/rejected": 0.1688549518585205, + "logps/chosen": -2.376094341278076, + "logps/rejected": -2.959827423095703, + "loss": 0.5972, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.376094341278076, + "rewards/margins": 0.5837332010269165, + "rewards/rejected": -2.959827423095703, + "sft_loss": 2.442779302597046, + "step": 1315 + }, + { + "epoch": 0.706472654290015, + "grad_norm": 8.547817393114872, + "learning_rate": 9.451421066667215e-07, + "logits/chosen": -0.14264704287052155, + "logits/rejected": 0.07813362777233124, + "logps/chosen": -2.3628363609313965, + "logps/rejected": -2.8726606369018555, + "loss": 0.6029, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.3628363609313965, + "rewards/margins": 0.5098242163658142, + "rewards/rejected": -2.8726606369018555, + "sft_loss": 2.477353572845459, + "step": 1320 + }, + { + "epoch": 0.7091486870714167, + "grad_norm": 15.063495170861021, + "learning_rate": 9.444306989457805e-07, + "logits/chosen": 0.05425446107983589, + "logits/rejected": 0.19093184173107147, + "logps/chosen": -2.3016774654388428, + "logps/rejected": -2.8103010654449463, + "loss": 0.6641, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.3016774654388428, + "rewards/margins": 0.5086237192153931, + "rewards/rejected": -2.8103010654449463, + "sft_loss": 2.3117482662200928, + "step": 1325 + }, + { + "epoch": 0.7118247198528181, + "grad_norm": 11.299849791199572, + "learning_rate": 9.437149793562489e-07, + "logits/chosen": 0.008106740191578865, + "logits/rejected": 0.13419048488140106, + "logps/chosen": -2.317131280899048, + "logps/rejected": -2.8257088661193848, + "loss": 0.6247, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -2.317131280899048, + "rewards/margins": 0.5085776448249817, + "rewards/rejected": -2.8257088661193848, + "sft_loss": 2.4108710289001465, + "step": 1330 + }, + { + "epoch": 0.7145007526342197, + "grad_norm": 11.67637808191526, + "learning_rate": 9.429949548420417e-07, + "logits/chosen": 0.0023697116412222385, + "logits/rejected": 0.10824587196111679, + "logps/chosen": -2.3033547401428223, + "logps/rejected": -2.787705421447754, + "loss": 0.6166, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.3033547401428223, + "rewards/margins": 0.4843505918979645, + "rewards/rejected": -2.787705421447754, + "sft_loss": 2.342329502105713, + "step": 1335 + }, + { + "epoch": 0.7171767854156214, + "grad_norm": 12.35202949763023, + "learning_rate": 9.422706323888396e-07, + "logits/chosen": 0.02033412829041481, + "logits/rejected": 0.07009680569171906, + "logps/chosen": -2.1604971885681152, + "logps/rejected": -2.5448131561279297, + "loss": 0.6474, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.1604971885681152, + "rewards/margins": 0.38431602716445923, + "rewards/rejected": -2.5448131561279297, + "sft_loss": 2.2100844383239746, + "step": 1340 + }, + { + "epoch": 0.719852818197023, + "grad_norm": 8.235804001921842, + "learning_rate": 9.415420190240225e-07, + "logits/chosen": 0.06834589689970016, + "logits/rejected": 0.2915201485157013, + "logps/chosen": -2.198119640350342, + "logps/rejected": -2.784534215927124, + "loss": 0.5475, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.198119640350342, + "rewards/margins": 0.586414635181427, + "rewards/rejected": -2.784534215927124, + "sft_loss": 2.3332037925720215, + "step": 1345 + }, + { + "epoch": 0.7225288509784245, + "grad_norm": 12.578137433589989, + "learning_rate": 9.408091218166002e-07, + "logits/chosen": 0.04810570925474167, + "logits/rejected": 0.1264292299747467, + "logps/chosen": -2.316530227661133, + "logps/rejected": -2.6347362995147705, + "loss": 0.6768, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -2.316530227661133, + "rewards/margins": 0.31820613145828247, + "rewards/rejected": -2.6347362995147705, + "sft_loss": 2.4868788719177246, + "step": 1350 + }, + { + "epoch": 0.7252048837598261, + "grad_norm": 9.732474560078147, + "learning_rate": 9.400719478771449e-07, + "logits/chosen": -0.0023418336641043425, + "logits/rejected": 0.32972416281700134, + "logps/chosen": -2.483259916305542, + "logps/rejected": -2.9924492835998535, + "loss": 0.6039, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.483259916305542, + "rewards/margins": 0.5091896057128906, + "rewards/rejected": -2.9924492835998535, + "sft_loss": 2.556227445602417, + "step": 1355 + }, + { + "epoch": 0.7278809165412277, + "grad_norm": 10.784811466778459, + "learning_rate": 9.393305043577209e-07, + "logits/chosen": -0.07075022161006927, + "logits/rejected": 0.07518994808197021, + "logps/chosen": -2.6700491905212402, + "logps/rejected": -3.307894229888916, + "loss": 0.5849, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.6700491905212402, + "rewards/margins": 0.6378449201583862, + "rewards/rejected": -3.307894229888916, + "sft_loss": 2.8535122871398926, + "step": 1360 + }, + { + "epoch": 0.7305569493226292, + "grad_norm": 8.152593413202151, + "learning_rate": 9.38584798451817e-07, + "logits/chosen": 0.001338806701824069, + "logits/rejected": 0.16451290249824524, + "logps/chosen": -2.5478076934814453, + "logps/rejected": -3.1517438888549805, + "loss": 0.5846, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.5478076934814453, + "rewards/margins": 0.6039361953735352, + "rewards/rejected": -3.1517438888549805, + "sft_loss": 2.6479318141937256, + "step": 1365 + }, + { + "epoch": 0.7332329821040308, + "grad_norm": 44.588830934101054, + "learning_rate": 9.37834837394275e-07, + "logits/chosen": 0.01660953089594841, + "logits/rejected": 0.1651841104030609, + "logps/chosen": -2.634829521179199, + "logps/rejected": -3.4330337047576904, + "loss": 0.5846, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.634829521179199, + "rewards/margins": 0.7982040643692017, + "rewards/rejected": -3.4330337047576904, + "sft_loss": 2.690223455429077, + "step": 1370 + }, + { + "epoch": 0.7359090148854324, + "grad_norm": 7.931378725379154, + "learning_rate": 9.370806284612203e-07, + "logits/chosen": -0.03525816649198532, + "logits/rejected": 0.1411258727312088, + "logps/chosen": -2.5949692726135254, + "logps/rejected": -3.309791088104248, + "loss": 0.5696, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.5949692726135254, + "rewards/margins": 0.7148216962814331, + "rewards/rejected": -3.309791088104248, + "sft_loss": 2.707218885421753, + "step": 1375 + }, + { + "epoch": 0.738585047666834, + "grad_norm": 8.866169013098796, + "learning_rate": 9.363221789699912e-07, + "logits/chosen": -0.07670523971319199, + "logits/rejected": 0.08060277998447418, + "logps/chosen": -2.572488784790039, + "logps/rejected": -3.0087974071502686, + "loss": 0.6832, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -2.572488784790039, + "rewards/margins": 0.43630871176719666, + "rewards/rejected": -3.0087974071502686, + "sft_loss": 2.633976459503174, + "step": 1380 + }, + { + "epoch": 0.7412610804482355, + "grad_norm": 19.933929857933897, + "learning_rate": 9.355594962790682e-07, + "logits/chosen": -0.03613440319895744, + "logits/rejected": 0.12450988590717316, + "logps/chosen": -2.275702476501465, + "logps/rejected": -2.871594190597534, + "loss": 0.6035, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.275702476501465, + "rewards/margins": 0.5958917737007141, + "rewards/rejected": -2.871594190597534, + "sft_loss": 2.397634983062744, + "step": 1385 + }, + { + "epoch": 0.7439371132296371, + "grad_norm": 9.498790915187232, + "learning_rate": 9.34792587788002e-07, + "logits/chosen": 0.04346933960914612, + "logits/rejected": 0.19381602108478546, + "logps/chosen": -2.349087715148926, + "logps/rejected": -2.825218677520752, + "loss": 0.6281, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.349087715148926, + "rewards/margins": 0.47613126039505005, + "rewards/rejected": -2.825218677520752, + "sft_loss": 2.442924737930298, + "step": 1390 + }, + { + "epoch": 0.7466131460110387, + "grad_norm": 8.605497264714288, + "learning_rate": 9.34021460937342e-07, + "logits/chosen": 0.06417609751224518, + "logits/rejected": 0.16199856996536255, + "logps/chosen": -2.3879554271698, + "logps/rejected": -2.80842924118042, + "loss": 0.6328, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.3879554271698, + "rewards/margins": 0.4204736649990082, + "rewards/rejected": -2.80842924118042, + "sft_loss": 2.4789719581604004, + "step": 1395 + }, + { + "epoch": 0.7492891787924402, + "grad_norm": 8.410478037971695, + "learning_rate": 9.332461232085646e-07, + "logits/chosen": -0.16510483622550964, + "logits/rejected": 0.0025481381453573704, + "logps/chosen": -2.527270793914795, + "logps/rejected": -2.992276668548584, + "loss": 0.6225, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -2.527270793914795, + "rewards/margins": 0.46500545740127563, + "rewards/rejected": -2.992276668548584, + "sft_loss": 2.6221566200256348, + "step": 1400 + }, + { + "epoch": 0.7519652115738418, + "grad_norm": 10.206815132123339, + "learning_rate": 9.324665821239998e-07, + "logits/chosen": -0.04792945086956024, + "logits/rejected": 0.17230018973350525, + "logps/chosen": -2.290905714035034, + "logps/rejected": -2.963562250137329, + "loss": 0.6188, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.290905714035034, + "rewards/margins": 0.6726564168930054, + "rewards/rejected": -2.963562250137329, + "sft_loss": 2.4523534774780273, + "step": 1405 + }, + { + "epoch": 0.7546412443552434, + "grad_norm": 12.703305381633276, + "learning_rate": 9.316828452467583e-07, + "logits/chosen": -0.04610484838485718, + "logits/rejected": 0.16724538803100586, + "logps/chosen": -2.3713042736053467, + "logps/rejected": -2.9829983711242676, + "loss": 0.5651, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.3713042736053467, + "rewards/margins": 0.6116942167282104, + "rewards/rejected": -2.9829983711242676, + "sft_loss": 2.5226478576660156, + "step": 1410 + }, + { + "epoch": 0.7573172771366449, + "grad_norm": 13.513252526025813, + "learning_rate": 9.30894920180659e-07, + "logits/chosen": 0.08314958959817886, + "logits/rejected": 0.2487107217311859, + "logps/chosen": -2.300837278366089, + "logps/rejected": -2.700249671936035, + "loss": 0.638, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.300837278366089, + "rewards/margins": 0.39941245317459106, + "rewards/rejected": -2.700249671936035, + "sft_loss": 2.325178623199463, + "step": 1415 + }, + { + "epoch": 0.7599933099180465, + "grad_norm": 7.612690497794865, + "learning_rate": 9.301028145701543e-07, + "logits/chosen": 0.07180485874414444, + "logits/rejected": 0.24491927027702332, + "logps/chosen": -2.378814220428467, + "logps/rejected": -3.0824999809265137, + "loss": 0.5982, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.378814220428467, + "rewards/margins": 0.7036858797073364, + "rewards/rejected": -3.0824999809265137, + "sft_loss": 2.5600998401641846, + "step": 1420 + }, + { + "epoch": 0.7626693426994481, + "grad_norm": 8.394035519991178, + "learning_rate": 9.293065361002563e-07, + "logits/chosen": 0.07057957351207733, + "logits/rejected": 0.2046077698469162, + "logps/chosen": -2.426069736480713, + "logps/rejected": -3.0615506172180176, + "loss": 0.598, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.426069736480713, + "rewards/margins": 0.6354812383651733, + "rewards/rejected": -3.0615506172180176, + "sft_loss": 2.4885902404785156, + "step": 1425 + }, + { + "epoch": 0.7653453754808497, + "grad_norm": 13.024495600475117, + "learning_rate": 9.285060924964622e-07, + "logits/chosen": -0.04211791604757309, + "logits/rejected": 0.11966993659734726, + "logps/chosen": -2.557664632797241, + "logps/rejected": -3.0726795196533203, + "loss": 0.6104, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.557664632797241, + "rewards/margins": 0.5150147676467896, + "rewards/rejected": -3.0726795196533203, + "sft_loss": 2.638887882232666, + "step": 1430 + }, + { + "epoch": 0.7680214082622512, + "grad_norm": 11.315382949699499, + "learning_rate": 9.277014915246792e-07, + "logits/chosen": 0.10380347073078156, + "logits/rejected": 0.17050564289093018, + "logps/chosen": -2.4016692638397217, + "logps/rejected": -3.118354082107544, + "loss": 0.575, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.4016692638397217, + "rewards/margins": 0.7166846990585327, + "rewards/rejected": -3.118354082107544, + "sft_loss": 2.5354397296905518, + "step": 1435 + }, + { + "epoch": 0.7706974410436528, + "grad_norm": 7.414405749714338, + "learning_rate": 9.268927409911498e-07, + "logits/chosen": 0.020688209682703018, + "logits/rejected": 0.15002763271331787, + "logps/chosen": -2.38792085647583, + "logps/rejected": -2.8667197227478027, + "loss": 0.6295, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.38792085647583, + "rewards/margins": 0.47879910469055176, + "rewards/rejected": -2.8667197227478027, + "sft_loss": 2.567570924758911, + "step": 1440 + }, + { + "epoch": 0.7733734738250544, + "grad_norm": 13.793743690118733, + "learning_rate": 9.260798487423749e-07, + "logits/chosen": -0.0633191242814064, + "logits/rejected": 0.18369950354099274, + "logps/chosen": -2.3912315368652344, + "logps/rejected": -2.8711366653442383, + "loss": 0.6286, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -2.3912315368652344, + "rewards/margins": 0.4799051284790039, + "rewards/rejected": -2.8711366653442383, + "sft_loss": 2.5537047386169434, + "step": 1445 + }, + { + "epoch": 0.7760495066064559, + "grad_norm": 13.521948035260047, + "learning_rate": 9.252628226650389e-07, + "logits/chosen": 0.04702399671077728, + "logits/rejected": 0.16084954142570496, + "logps/chosen": -2.424561023712158, + "logps/rejected": -2.8729541301727295, + "loss": 0.671, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.424561023712158, + "rewards/margins": 0.4483933448791504, + "rewards/rejected": -2.8729541301727295, + "sft_loss": 2.5094757080078125, + "step": 1450 + }, + { + "epoch": 0.7787255393878575, + "grad_norm": 11.039814398221159, + "learning_rate": 9.244416706859321e-07, + "logits/chosen": -0.022870201617479324, + "logits/rejected": 0.1717710942029953, + "logps/chosen": -2.3271360397338867, + "logps/rejected": -2.869342803955078, + "loss": 0.6107, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.3271360397338867, + "rewards/margins": 0.542206883430481, + "rewards/rejected": -2.869342803955078, + "sft_loss": 2.42985463142395, + "step": 1455 + }, + { + "epoch": 0.7814015721692591, + "grad_norm": 7.698204509511495, + "learning_rate": 9.23616400771875e-07, + "logits/chosen": 0.023630857467651367, + "logits/rejected": 0.23449544608592987, + "logps/chosen": -2.3770511150360107, + "logps/rejected": -2.9637835025787354, + "loss": 0.5966, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.3770511150360107, + "rewards/margins": 0.5867326855659485, + "rewards/rejected": -2.9637835025787354, + "sft_loss": 2.427887201309204, + "step": 1460 + }, + { + "epoch": 0.7840776049506607, + "grad_norm": 8.187687061644198, + "learning_rate": 9.227870209296395e-07, + "logits/chosen": 0.016844961792230606, + "logits/rejected": 0.16727712750434875, + "logps/chosen": -2.491568088531494, + "logps/rejected": -2.9410855770111084, + "loss": 0.6333, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -2.491568088531494, + "rewards/margins": 0.4495179057121277, + "rewards/rejected": -2.9410855770111084, + "sft_loss": 2.5922093391418457, + "step": 1465 + }, + { + "epoch": 0.7867536377320622, + "grad_norm": 10.159632993115585, + "learning_rate": 9.219535392058728e-07, + "logits/chosen": -0.017890067771077156, + "logits/rejected": 0.014517772011458874, + "logps/chosen": -2.4694104194641113, + "logps/rejected": -2.9818406105041504, + "loss": 0.627, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -2.4694104194641113, + "rewards/margins": 0.5124302506446838, + "rewards/rejected": -2.9818406105041504, + "sft_loss": 2.578446626663208, + "step": 1470 + }, + { + "epoch": 0.7894296705134638, + "grad_norm": 9.98388075126376, + "learning_rate": 9.211159636870181e-07, + "logits/chosen": -0.051668472588062286, + "logits/rejected": 0.15935666859149933, + "logps/chosen": -2.43375301361084, + "logps/rejected": -3.010441541671753, + "loss": 0.6019, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.43375301361084, + "rewards/margins": 0.5766886472702026, + "rewards/rejected": -3.010441541671753, + "sft_loss": 2.519343137741089, + "step": 1475 + }, + { + "epoch": 0.7921057032948654, + "grad_norm": 9.826631961075899, + "learning_rate": 9.202743024992367e-07, + "logits/chosen": 0.06925741583108902, + "logits/rejected": 0.19104215502738953, + "logps/chosen": -2.2985386848449707, + "logps/rejected": -2.9673094749450684, + "loss": 0.5893, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.2985386848449707, + "rewards/margins": 0.6687710881233215, + "rewards/rejected": -2.9673094749450684, + "sft_loss": 2.3903632164001465, + "step": 1480 + }, + { + "epoch": 0.7947817360762669, + "grad_norm": 14.190865641317497, + "learning_rate": 9.194285638083293e-07, + "logits/chosen": 0.054853539913892746, + "logits/rejected": 0.24246785044670105, + "logps/chosen": -2.4935526847839355, + "logps/rejected": -3.1982533931732178, + "loss": 0.5546, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.4935526847839355, + "rewards/margins": 0.7047004699707031, + "rewards/rejected": -3.1982533931732178, + "sft_loss": 2.608858585357666, + "step": 1485 + }, + { + "epoch": 0.7974577688576685, + "grad_norm": 12.488609995267533, + "learning_rate": 9.185787558196562e-07, + "logits/chosen": -0.029964571818709373, + "logits/rejected": 0.11130587756633759, + "logps/chosen": -2.4313859939575195, + "logps/rejected": -2.966317892074585, + "loss": 0.6163, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.4313859939575195, + "rewards/margins": 0.534932017326355, + "rewards/rejected": -2.966317892074585, + "sft_loss": 2.5397000312805176, + "step": 1490 + }, + { + "epoch": 0.8001338016390701, + "grad_norm": 10.465880185412507, + "learning_rate": 9.177248867780583e-07, + "logits/chosen": 0.029187191277742386, + "logits/rejected": 0.1466110199689865, + "logps/chosen": -2.6569457054138184, + "logps/rejected": -3.035051107406616, + "loss": 0.6588, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -2.6569457054138184, + "rewards/margins": 0.3781054615974426, + "rewards/rejected": -3.035051107406616, + "sft_loss": 2.8448221683502197, + "step": 1495 + }, + { + "epoch": 0.8028098344204716, + "grad_norm": 10.168222832018843, + "learning_rate": 9.168669649677769e-07, + "logits/chosen": -0.03170633316040039, + "logits/rejected": 0.09394307434558868, + "logps/chosen": -2.513032913208008, + "logps/rejected": -3.018311023712158, + "loss": 0.6592, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -2.513032913208008, + "rewards/margins": 0.5052781105041504, + "rewards/rejected": -3.018311023712158, + "sft_loss": 2.691277027130127, + "step": 1500 + }, + { + "epoch": 0.8054858672018732, + "grad_norm": 10.59003170893267, + "learning_rate": 9.16004998712373e-07, + "logits/chosen": 0.04619743674993515, + "logits/rejected": 0.13593070209026337, + "logps/chosen": -2.49043607711792, + "logps/rejected": -3.106337785720825, + "loss": 0.5759, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.49043607711792, + "rewards/margins": 0.6159020662307739, + "rewards/rejected": -3.106337785720825, + "sft_loss": 2.5797295570373535, + "step": 1505 + }, + { + "epoch": 0.8081618999832748, + "grad_norm": 6.3951303624507565, + "learning_rate": 9.151389963746472e-07, + "logits/chosen": -0.03683237358927727, + "logits/rejected": 0.29354020953178406, + "logps/chosen": -2.4399712085723877, + "logps/rejected": -3.1233441829681396, + "loss": 0.548, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.4399712085723877, + "rewards/margins": 0.6833727359771729, + "rewards/rejected": -3.1233441829681396, + "sft_loss": 2.5200114250183105, + "step": 1510 + }, + { + "epoch": 0.8108379327646764, + "grad_norm": 8.83858247981277, + "learning_rate": 9.142689663565577e-07, + "logits/chosen": 0.06057562306523323, + "logits/rejected": 0.1377892941236496, + "logps/chosen": -2.4171109199523926, + "logps/rejected": -3.0059268474578857, + "loss": 0.5867, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.4171109199523926, + "rewards/margins": 0.5888158082962036, + "rewards/rejected": -3.0059268474578857, + "sft_loss": 2.5347301959991455, + "step": 1515 + }, + { + "epoch": 0.8135139655460779, + "grad_norm": 10.59050926766106, + "learning_rate": 9.133949170991397e-07, + "logits/chosen": 0.021306898444890976, + "logits/rejected": 0.13196924328804016, + "logps/chosen": -2.4130589962005615, + "logps/rejected": -3.0544638633728027, + "loss": 0.5634, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.4130589962005615, + "rewards/margins": 0.6414052248001099, + "rewards/rejected": -3.0544638633728027, + "sft_loss": 2.662980794906616, + "step": 1520 + }, + { + "epoch": 0.8161899983274795, + "grad_norm": 9.759469171962795, + "learning_rate": 9.125168570824231e-07, + "logits/chosen": -0.017821846529841423, + "logits/rejected": 0.1999787837266922, + "logps/chosen": -2.5519795417785645, + "logps/rejected": -3.0707650184631348, + "loss": 0.6221, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.5519795417785645, + "rewards/margins": 0.5187851190567017, + "rewards/rejected": -3.0707650184631348, + "sft_loss": 2.6400365829467773, + "step": 1525 + }, + { + "epoch": 0.8188660311088811, + "grad_norm": 10.399775137815965, + "learning_rate": 9.116347948253496e-07, + "logits/chosen": -0.026381874457001686, + "logits/rejected": 0.1348962038755417, + "logps/chosen": -2.6443264484405518, + "logps/rejected": -3.1459546089172363, + "loss": 0.6222, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.6443264484405518, + "rewards/margins": 0.5016285181045532, + "rewards/rejected": -3.1459546089172363, + "sft_loss": 2.7468667030334473, + "step": 1530 + }, + { + "epoch": 0.8215420638902826, + "grad_norm": 10.118881166792805, + "learning_rate": 9.107487388856916e-07, + "logits/chosen": -0.0058399587869644165, + "logits/rejected": 0.21891054511070251, + "logps/chosen": -2.5201256275177, + "logps/rejected": -3.0925190448760986, + "loss": 0.5805, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.5201256275177, + "rewards/margins": 0.5723938345909119, + "rewards/rejected": -3.0925190448760986, + "sft_loss": 2.6686787605285645, + "step": 1535 + }, + { + "epoch": 0.8242180966716842, + "grad_norm": 14.069021014111197, + "learning_rate": 9.098586978599673e-07, + "logits/chosen": 0.05855223536491394, + "logits/rejected": 0.25015169382095337, + "logps/chosen": -2.5722360610961914, + "logps/rejected": -3.3856213092803955, + "loss": 0.5534, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.5722360610961914, + "rewards/margins": 0.8133853077888489, + "rewards/rejected": -3.3856213092803955, + "sft_loss": 2.6835830211639404, + "step": 1540 + }, + { + "epoch": 0.8268941294530858, + "grad_norm": 8.476476232068334, + "learning_rate": 9.089646803833588e-07, + "logits/chosen": 0.06817345321178436, + "logits/rejected": 0.2557623088359833, + "logps/chosen": -2.548581838607788, + "logps/rejected": -3.107095241546631, + "loss": 0.6084, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.548581838607788, + "rewards/margins": 0.558513343334198, + "rewards/rejected": -3.107095241546631, + "sft_loss": 2.7562756538391113, + "step": 1545 + }, + { + "epoch": 0.8295701622344873, + "grad_norm": 10.063918914577242, + "learning_rate": 9.080666951296276e-07, + "logits/chosen": -0.09717416018247604, + "logits/rejected": 0.22753658890724182, + "logps/chosen": -2.5381972789764404, + "logps/rejected": -3.340608596801758, + "loss": 0.5353, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.5381972789764404, + "rewards/margins": 0.8024111986160278, + "rewards/rejected": -3.340608596801758, + "sft_loss": 2.6446547508239746, + "step": 1550 + }, + { + "epoch": 0.8322461950158889, + "grad_norm": 10.929038771965324, + "learning_rate": 9.071647508110305e-07, + "logits/chosen": -0.09855834394693375, + "logits/rejected": 0.22255852818489075, + "logps/chosen": -2.6067984104156494, + "logps/rejected": -3.4245991706848145, + "loss": 0.5868, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.6067984104156494, + "rewards/margins": 0.8178008198738098, + "rewards/rejected": -3.4245991706848145, + "sft_loss": 2.7024035453796387, + "step": 1555 + }, + { + "epoch": 0.8349222277972905, + "grad_norm": 10.825492368892702, + "learning_rate": 9.062588561782354e-07, + "logits/chosen": 0.045937247574329376, + "logits/rejected": 0.1359720081090927, + "logps/chosen": -2.725828170776367, + "logps/rejected": -3.294826030731201, + "loss": 0.6275, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.725828170776367, + "rewards/margins": 0.5689979195594788, + "rewards/rejected": -3.294826030731201, + "sft_loss": 2.925290107727051, + "step": 1560 + }, + { + "epoch": 0.8375982605786921, + "grad_norm": 8.38173658353747, + "learning_rate": 9.053490200202358e-07, + "logits/chosen": 0.056266169995069504, + "logits/rejected": 0.1544162929058075, + "logps/chosen": -2.730910062789917, + "logps/rejected": -3.2816321849823, + "loss": 0.6223, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.730910062789917, + "rewards/margins": 0.5507221221923828, + "rewards/rejected": -3.2816321849823, + "sft_loss": 2.8730056285858154, + "step": 1565 + }, + { + "epoch": 0.8402742933600936, + "grad_norm": 14.039872862864177, + "learning_rate": 9.044352511642661e-07, + "logits/chosen": 0.02374919317662716, + "logits/rejected": 0.07336236536502838, + "logps/chosen": -2.6218318939208984, + "logps/rejected": -3.0729284286499023, + "loss": 0.6783, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.6218318939208984, + "rewards/margins": 0.45109647512435913, + "rewards/rejected": -3.0729284286499023, + "sft_loss": 2.8568921089172363, + "step": 1570 + }, + { + "epoch": 0.8429503261414952, + "grad_norm": 8.102782982998859, + "learning_rate": 9.03517558475716e-07, + "logits/chosen": 0.02059529349207878, + "logits/rejected": 0.1358802318572998, + "logps/chosen": -2.311969518661499, + "logps/rejected": -2.7871482372283936, + "loss": 0.6045, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.311969518661499, + "rewards/margins": 0.47517871856689453, + "rewards/rejected": -2.7871482372283936, + "sft_loss": 2.404184341430664, + "step": 1575 + }, + { + "epoch": 0.8456263589228968, + "grad_norm": 10.945963523938573, + "learning_rate": 9.025959508580436e-07, + "logits/chosen": 0.05906160920858383, + "logits/rejected": 0.3352259695529938, + "logps/chosen": -2.40116548538208, + "logps/rejected": -2.9874892234802246, + "loss": 0.586, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.40116548538208, + "rewards/margins": 0.586323618888855, + "rewards/rejected": -2.9874892234802246, + "sft_loss": 2.513141632080078, + "step": 1580 + }, + { + "epoch": 0.8483023917042983, + "grad_norm": 7.322528639219694, + "learning_rate": 9.016704372526905e-07, + "logits/chosen": 0.016174791380763054, + "logits/rejected": 0.21014456450939178, + "logps/chosen": -2.2829861640930176, + "logps/rejected": -2.883354663848877, + "loss": 0.5812, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.2829861640930176, + "rewards/margins": 0.6003685593605042, + "rewards/rejected": -2.883354663848877, + "sft_loss": 2.4132537841796875, + "step": 1585 + }, + { + "epoch": 0.8509784244856999, + "grad_norm": 14.393486048168747, + "learning_rate": 9.007410266389934e-07, + "logits/chosen": -0.07246458530426025, + "logits/rejected": 0.015409506857395172, + "logps/chosen": -2.3226370811462402, + "logps/rejected": -2.7810959815979004, + "loss": 0.6211, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.3226370811462402, + "rewards/margins": 0.4584590792655945, + "rewards/rejected": -2.7810959815979004, + "sft_loss": 2.4499218463897705, + "step": 1590 + }, + { + "epoch": 0.8536544572671015, + "grad_norm": 13.456133600052352, + "learning_rate": 8.998077280340981e-07, + "logits/chosen": 0.018017996102571487, + "logits/rejected": 0.09234277158975601, + "logps/chosen": -2.4913840293884277, + "logps/rejected": -2.8565590381622314, + "loss": 0.6712, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.4913840293884277, + "rewards/margins": 0.36517494916915894, + "rewards/rejected": -2.8565590381622314, + "sft_loss": 2.5182454586029053, + "step": 1595 + }, + { + "epoch": 0.8563304900485031, + "grad_norm": 10.74731582976374, + "learning_rate": 8.988705504928722e-07, + "logits/chosen": -0.10593881458044052, + "logits/rejected": 0.11520107090473175, + "logps/chosen": -2.420809268951416, + "logps/rejected": -3.160034418106079, + "loss": 0.5405, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.420809268951416, + "rewards/margins": 0.7392248511314392, + "rewards/rejected": -3.160034418106079, + "sft_loss": 2.5499579906463623, + "step": 1600 + }, + { + "epoch": 0.8563304900485031, + "eval_logits/chosen": 0.3042474389076233, + "eval_logits/rejected": 0.41216760873794556, + "eval_logps/chosen": -2.415980100631714, + "eval_logps/rejected": -3.020190477371216, + "eval_loss": 0.578135073184967, + "eval_rewards/accuracies": 0.7091988325119019, + "eval_rewards/chosen": -2.415980100631714, + "eval_rewards/margins": 0.6042105555534363, + "eval_rewards/rejected": -3.020190477371216, + "eval_runtime": 52.1108, + "eval_samples_per_second": 25.81, + "eval_sft_loss": 2.5442447662353516, + "eval_steps_per_second": 6.467, + "step": 1600 + }, + { + "epoch": 0.8590065228299046, + "grad_norm": 10.524729461130852, + "learning_rate": 8.979295031078157e-07, + "logits/chosen": -0.09900583326816559, + "logits/rejected": 0.1608930081129074, + "logps/chosen": -2.5220437049865723, + "logps/rejected": -3.159421920776367, + "loss": 0.5715, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.5220437049865723, + "rewards/margins": 0.6373783946037292, + "rewards/rejected": -3.159421920776367, + "sft_loss": 2.5957248210906982, + "step": 1605 + }, + { + "epoch": 0.8616825556113062, + "grad_norm": 10.311884692182046, + "learning_rate": 8.969845950089751e-07, + "logits/chosen": -0.10433954000473022, + "logits/rejected": 0.09323252737522125, + "logps/chosen": -2.5022635459899902, + "logps/rejected": -3.2369015216827393, + "loss": 0.5502, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.5022635459899902, + "rewards/margins": 0.7346378564834595, + "rewards/rejected": -3.2369015216827393, + "sft_loss": 2.668539524078369, + "step": 1610 + }, + { + "epoch": 0.8643585883927078, + "grad_norm": 13.504489740740828, + "learning_rate": 8.960358353638526e-07, + "logits/chosen": -0.04470493271946907, + "logits/rejected": 0.09468583017587662, + "logps/chosen": -2.56691312789917, + "logps/rejected": -3.183248996734619, + "loss": 0.6348, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.56691312789917, + "rewards/margins": 0.6163356900215149, + "rewards/rejected": -3.183248996734619, + "sft_loss": 2.6765799522399902, + "step": 1615 + }, + { + "epoch": 0.8670346211741093, + "grad_norm": 9.87102588828472, + "learning_rate": 8.950832333773184e-07, + "logits/chosen": -0.029353905469179153, + "logits/rejected": 0.15252381563186646, + "logps/chosen": -2.49654483795166, + "logps/rejected": -3.065964937210083, + "loss": 0.634, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.49654483795166, + "rewards/margins": 0.5694200992584229, + "rewards/rejected": -3.065964937210083, + "sft_loss": 2.660219430923462, + "step": 1620 + }, + { + "epoch": 0.869710653955511, + "grad_norm": 11.89297302689208, + "learning_rate": 8.941267982915213e-07, + "logits/chosen": 0.02455337718129158, + "logits/rejected": 0.08116547763347626, + "logps/chosen": -2.5555217266082764, + "logps/rejected": -2.864656686782837, + "loss": 0.7054, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -2.5555217266082764, + "rewards/margins": 0.30913475155830383, + "rewards/rejected": -2.864656686782837, + "sft_loss": 2.5726044178009033, + "step": 1625 + }, + { + "epoch": 0.8723866867369126, + "grad_norm": 8.067304083731935, + "learning_rate": 8.931665393857983e-07, + "logits/chosen": -0.003184753004461527, + "logits/rejected": 0.15830065310001373, + "logps/chosen": -2.301793336868286, + "logps/rejected": -2.8337371349334717, + "loss": 0.5993, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.301793336868286, + "rewards/margins": 0.5319440364837646, + "rewards/rejected": -2.8337371349334717, + "sft_loss": 2.388474225997925, + "step": 1630 + }, + { + "epoch": 0.875062719518314, + "grad_norm": 7.820656606757321, + "learning_rate": 8.922024659765861e-07, + "logits/chosen": -0.11717693507671356, + "logits/rejected": 0.031198328360915184, + "logps/chosen": -2.133420467376709, + "logps/rejected": -2.7510008811950684, + "loss": 0.5645, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.133420467376709, + "rewards/margins": 0.6175805330276489, + "rewards/rejected": -2.7510008811950684, + "sft_loss": 2.2334976196289062, + "step": 1635 + }, + { + "epoch": 0.8777387522997157, + "grad_norm": 8.589411461863673, + "learning_rate": 8.912345874173288e-07, + "logits/chosen": -0.07331643998622894, + "logits/rejected": 0.06835556030273438, + "logps/chosen": -2.201294183731079, + "logps/rejected": -2.838127374649048, + "loss": 0.5824, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.201294183731079, + "rewards/margins": 0.6368333101272583, + "rewards/rejected": -2.838127374649048, + "sft_loss": 2.3204426765441895, + "step": 1640 + }, + { + "epoch": 0.8804147850811173, + "grad_norm": 9.556488632247595, + "learning_rate": 8.902629130983885e-07, + "logits/chosen": -0.008688343688845634, + "logits/rejected": 0.0492224246263504, + "logps/chosen": -2.273496389389038, + "logps/rejected": -2.669645309448242, + "loss": 0.6292, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.273496389389038, + "rewards/margins": 0.3961489200592041, + "rewards/rejected": -2.669645309448242, + "sft_loss": 2.380650281906128, + "step": 1645 + }, + { + "epoch": 0.8830908178625189, + "grad_norm": 11.532073827027904, + "learning_rate": 8.892874524469537e-07, + "logits/chosen": 0.0749739557504654, + "logits/rejected": 0.14436577260494232, + "logps/chosen": -2.218055248260498, + "logps/rejected": -2.7674765586853027, + "loss": 0.5729, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.218055248260498, + "rewards/margins": 0.5494211912155151, + "rewards/rejected": -2.7674765586853027, + "sft_loss": 2.2503247261047363, + "step": 1650 + }, + { + "epoch": 0.8857668506439204, + "grad_norm": 11.091732222379273, + "learning_rate": 8.883082149269478e-07, + "logits/chosen": -0.04484002664685249, + "logits/rejected": 0.0897749662399292, + "logps/chosen": -2.3275647163391113, + "logps/rejected": -2.8501715660095215, + "loss": 0.5955, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.3275647163391113, + "rewards/margins": 0.5226072072982788, + "rewards/rejected": -2.8501715660095215, + "sft_loss": 2.369812488555908, + "step": 1655 + }, + { + "epoch": 0.888442883425322, + "grad_norm": 9.42136788562333, + "learning_rate": 8.873252100389377e-07, + "logits/chosen": -0.004197058267891407, + "logits/rejected": 0.049018494784832, + "logps/chosen": -2.1921660900115967, + "logps/rejected": -2.7342629432678223, + "loss": 0.5732, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.1921660900115967, + "rewards/margins": 0.5420966744422913, + "rewards/rejected": -2.7342629432678223, + "sft_loss": 2.228539228439331, + "step": 1660 + }, + { + "epoch": 0.8911189162067236, + "grad_norm": 11.62999518562216, + "learning_rate": 8.863384473200411e-07, + "logits/chosen": 0.006611555814743042, + "logits/rejected": 0.07799387723207474, + "logps/chosen": -2.5068271160125732, + "logps/rejected": -2.906339168548584, + "loss": 0.6524, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.5068271160125732, + "rewards/margins": 0.3995123505592346, + "rewards/rejected": -2.906339168548584, + "sft_loss": 2.572601795196533, + "step": 1665 + }, + { + "epoch": 0.8937949489881251, + "grad_norm": 9.897573518256243, + "learning_rate": 8.853479363438342e-07, + "logits/chosen": 0.030647706240415573, + "logits/rejected": 0.21590964496135712, + "logps/chosen": -2.4557127952575684, + "logps/rejected": -2.8481664657592773, + "loss": 0.6855, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.4557127952575684, + "rewards/margins": 0.39245349168777466, + "rewards/rejected": -2.8481664657592773, + "sft_loss": 2.4239420890808105, + "step": 1670 + }, + { + "epoch": 0.8964709817695267, + "grad_norm": 9.069166113077122, + "learning_rate": 8.843536867202588e-07, + "logits/chosen": 0.002661994192749262, + "logits/rejected": 0.2346363514661789, + "logps/chosen": -2.4013454914093018, + "logps/rejected": -3.0166726112365723, + "loss": 0.5964, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.4013454914093018, + "rewards/margins": 0.6153267621994019, + "rewards/rejected": -3.0166726112365723, + "sft_loss": 2.511767864227295, + "step": 1675 + }, + { + "epoch": 0.8991470145509283, + "grad_norm": 10.264602842040526, + "learning_rate": 8.833557080955292e-07, + "logits/chosen": -0.10588578134775162, + "logits/rejected": 0.02353302203118801, + "logps/chosen": -2.397885322570801, + "logps/rejected": -2.8403050899505615, + "loss": 0.6325, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -2.397885322570801, + "rewards/margins": 0.4424198269844055, + "rewards/rejected": -2.8403050899505615, + "sft_loss": 2.5135016441345215, + "step": 1680 + }, + { + "epoch": 0.9018230473323299, + "grad_norm": 8.803254226319224, + "learning_rate": 8.823540101520381e-07, + "logits/chosen": -0.04913238063454628, + "logits/rejected": 0.19184985756874084, + "logps/chosen": -2.2383077144622803, + "logps/rejected": -2.8146414756774902, + "loss": 0.6112, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.2383077144622803, + "rewards/margins": 0.5763339400291443, + "rewards/rejected": -2.8146414756774902, + "sft_loss": 2.3560469150543213, + "step": 1685 + }, + { + "epoch": 0.9044990801137314, + "grad_norm": 8.00691293957899, + "learning_rate": 8.813486026082637e-07, + "logits/chosen": -0.04420467093586922, + "logits/rejected": 0.1645149141550064, + "logps/chosen": -2.1973953247070312, + "logps/rejected": -2.8400979042053223, + "loss": 0.5517, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.1973953247070312, + "rewards/margins": 0.6427024602890015, + "rewards/rejected": -2.8400979042053223, + "sft_loss": 2.3609063625335693, + "step": 1690 + }, + { + "epoch": 0.907175112895133, + "grad_norm": 13.186135400387222, + "learning_rate": 8.803394952186742e-07, + "logits/chosen": -0.20445886254310608, + "logits/rejected": -0.030445415526628494, + "logps/chosen": -2.309983491897583, + "logps/rejected": -2.8405935764312744, + "loss": 0.5976, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.309983491897583, + "rewards/margins": 0.5306099653244019, + "rewards/rejected": -2.8405935764312744, + "sft_loss": 2.475982189178467, + "step": 1695 + }, + { + "epoch": 0.9098511456765346, + "grad_norm": 11.275274668930086, + "learning_rate": 8.793266977736342e-07, + "logits/chosen": 0.0015526011120527983, + "logits/rejected": -0.050358422100543976, + "logps/chosen": -2.424626111984253, + "logps/rejected": -2.747622489929199, + "loss": 0.655, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.424626111984253, + "rewards/margins": 0.3229961097240448, + "rewards/rejected": -2.747622489929199, + "sft_loss": 2.554354190826416, + "step": 1700 + }, + { + "epoch": 0.9125271784579361, + "grad_norm": 13.117118001873978, + "learning_rate": 8.783102200993085e-07, + "logits/chosen": -0.037245552986860275, + "logits/rejected": 0.11655652523040771, + "logps/chosen": -2.402632236480713, + "logps/rejected": -3.000462293624878, + "loss": 0.5785, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.402632236480713, + "rewards/margins": 0.5978304743766785, + "rewards/rejected": -3.000462293624878, + "sft_loss": 2.527595043182373, + "step": 1705 + }, + { + "epoch": 0.9152032112393377, + "grad_norm": 9.46908294460337, + "learning_rate": 8.772900720575683e-07, + "logits/chosen": -0.05901443958282471, + "logits/rejected": 0.04909048229455948, + "logps/chosen": -2.5413331985473633, + "logps/rejected": -2.999572992324829, + "loss": 0.6188, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.5413331985473633, + "rewards/margins": 0.45823970437049866, + "rewards/rejected": -2.999572992324829, + "sft_loss": 2.6973278522491455, + "step": 1710 + }, + { + "epoch": 0.9178792440207393, + "grad_norm": 12.101459296969413, + "learning_rate": 8.762662635458944e-07, + "logits/chosen": -0.07247239351272583, + "logits/rejected": 0.1388850212097168, + "logps/chosen": -2.650653123855591, + "logps/rejected": -3.1614272594451904, + "loss": 0.6523, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -2.650653123855591, + "rewards/margins": 0.5107744932174683, + "rewards/rejected": -3.1614272594451904, + "sft_loss": 2.730468988418579, + "step": 1715 + }, + { + "epoch": 0.9205552768021408, + "grad_norm": 11.961155999673137, + "learning_rate": 8.752388044972811e-07, + "logits/chosen": -0.05106500908732414, + "logits/rejected": 0.0330757237970829, + "logps/chosen": -2.454437732696533, + "logps/rejected": -3.1570143699645996, + "loss": 0.565, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.454437732696533, + "rewards/margins": 0.7025768160820007, + "rewards/rejected": -3.1570143699645996, + "sft_loss": 2.619330644607544, + "step": 1720 + }, + { + "epoch": 0.9232313095835424, + "grad_norm": 8.617608037704702, + "learning_rate": 8.74207704880141e-07, + "logits/chosen": -0.022454053163528442, + "logits/rejected": 0.0959378033876419, + "logps/chosen": -2.598407745361328, + "logps/rejected": -3.4198384284973145, + "loss": 0.5128, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.598407745361328, + "rewards/margins": 0.8214303851127625, + "rewards/rejected": -3.4198384284973145, + "sft_loss": 2.8084254264831543, + "step": 1725 + }, + { + "epoch": 0.925907342364944, + "grad_norm": 10.519713288799583, + "learning_rate": 8.731729746982068e-07, + "logits/chosen": 0.01740475744009018, + "logits/rejected": 0.10223283618688583, + "logps/chosen": -2.5419623851776123, + "logps/rejected": -3.128298282623291, + "loss": 0.5918, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.5419623851776123, + "rewards/margins": 0.5863358378410339, + "rewards/rejected": -3.128298282623291, + "sft_loss": 2.7975895404815674, + "step": 1730 + }, + { + "epoch": 0.9285833751463456, + "grad_norm": 10.455643617501638, + "learning_rate": 8.721346239904355e-07, + "logits/chosen": -0.16528668999671936, + "logits/rejected": 0.045746032148599625, + "logps/chosen": -2.6919751167297363, + "logps/rejected": -3.503297805786133, + "loss": 0.5759, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.6919751167297363, + "rewards/margins": 0.8113226890563965, + "rewards/rejected": -3.503297805786133, + "sft_loss": 2.817317247390747, + "step": 1735 + }, + { + "epoch": 0.9312594079277471, + "grad_norm": 10.852266542046127, + "learning_rate": 8.710926628309101e-07, + "logits/chosen": -0.10590909421443939, + "logits/rejected": 0.08133234828710556, + "logps/chosen": -2.810664653778076, + "logps/rejected": -3.4224929809570312, + "loss": 0.578, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.810664653778076, + "rewards/margins": 0.6118277311325073, + "rewards/rejected": -3.4224929809570312, + "sft_loss": 2.9222919940948486, + "step": 1740 + }, + { + "epoch": 0.9339354407091487, + "grad_norm": 7.61006798221914, + "learning_rate": 8.700471013287424e-07, + "logits/chosen": 0.021407146006822586, + "logits/rejected": 0.06234606355428696, + "logps/chosen": -2.5792348384857178, + "logps/rejected": -3.1805102825164795, + "loss": 0.5893, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.5792348384857178, + "rewards/margins": 0.6012751460075378, + "rewards/rejected": -3.1805102825164795, + "sft_loss": 2.7608916759490967, + "step": 1745 + }, + { + "epoch": 0.9366114734905503, + "grad_norm": 14.860283835159844, + "learning_rate": 8.689979496279746e-07, + "logits/chosen": -0.027105966582894325, + "logits/rejected": 0.03506525978446007, + "logps/chosen": -2.8316097259521484, + "logps/rejected": -3.2912864685058594, + "loss": 0.6747, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.8316097259521484, + "rewards/margins": 0.45967674255371094, + "rewards/rejected": -3.2912864685058594, + "sft_loss": 2.979743480682373, + "step": 1750 + }, + { + "epoch": 0.9392875062719518, + "grad_norm": 10.10079478152837, + "learning_rate": 8.679452179074811e-07, + "logits/chosen": -0.028563061729073524, + "logits/rejected": 0.10825793445110321, + "logps/chosen": -2.69927716255188, + "logps/rejected": -3.3682494163513184, + "loss": 0.5521, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.69927716255188, + "rewards/margins": 0.6689725518226624, + "rewards/rejected": -3.3682494163513184, + "sft_loss": 2.9125723838806152, + "step": 1755 + }, + { + "epoch": 0.9419635390533534, + "grad_norm": 13.235932611878656, + "learning_rate": 8.668889163808698e-07, + "logits/chosen": -0.03027569130063057, + "logits/rejected": 0.12222941219806671, + "logps/chosen": -2.6722209453582764, + "logps/rejected": -3.266571521759033, + "loss": 0.5746, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.6722209453582764, + "rewards/margins": 0.5943503379821777, + "rewards/rejected": -3.266571521759033, + "sft_loss": 2.8436102867126465, + "step": 1760 + }, + { + "epoch": 0.944639571834755, + "grad_norm": 11.58412404023222, + "learning_rate": 8.658290552963827e-07, + "logits/chosen": 0.03251715749502182, + "logits/rejected": 0.07301442325115204, + "logps/chosen": -2.695906162261963, + "logps/rejected": -3.2821033000946045, + "loss": 0.6197, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.695906162261963, + "rewards/margins": 0.5861972570419312, + "rewards/rejected": -3.2821033000946045, + "sft_loss": 2.8826968669891357, + "step": 1765 + }, + { + "epoch": 0.9473156046161565, + "grad_norm": 10.649089680608327, + "learning_rate": 8.647656449367966e-07, + "logits/chosen": 0.020825685933232307, + "logits/rejected": 0.19587847590446472, + "logps/chosen": -2.665592908859253, + "logps/rejected": -3.1587936878204346, + "loss": 0.6363, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -2.665592908859253, + "rewards/margins": 0.49320077896118164, + "rewards/rejected": -3.1587936878204346, + "sft_loss": 2.8860023021698, + "step": 1770 + }, + { + "epoch": 0.9499916373975581, + "grad_norm": 9.615398550070553, + "learning_rate": 8.636986956193235e-07, + "logits/chosen": -0.04956268146634102, + "logits/rejected": 0.08047482371330261, + "logps/chosen": -2.4707024097442627, + "logps/rejected": -3.0871739387512207, + "loss": 0.584, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.4707024097442627, + "rewards/margins": 0.6164714694023132, + "rewards/rejected": -3.0871739387512207, + "sft_loss": 2.6601004600524902, + "step": 1775 + }, + { + "epoch": 0.9526676701789597, + "grad_norm": 9.694860492684317, + "learning_rate": 8.626282176955104e-07, + "logits/chosen": -0.030627835541963577, + "logits/rejected": 0.10882334411144257, + "logps/chosen": -2.3581511974334717, + "logps/rejected": -2.9486238956451416, + "loss": 0.5797, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.3581511974334717, + "rewards/margins": 0.5904725193977356, + "rewards/rejected": -2.9486238956451416, + "sft_loss": 2.451658248901367, + "step": 1780 + }, + { + "epoch": 0.9553437029603613, + "grad_norm": 14.376924824437532, + "learning_rate": 8.615542215511389e-07, + "logits/chosen": 0.03919892758131027, + "logits/rejected": 0.11939878761768341, + "logps/chosen": -2.399656295776367, + "logps/rejected": -2.767712116241455, + "loss": 0.6465, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.399656295776367, + "rewards/margins": 0.3680557310581207, + "rewards/rejected": -2.767712116241455, + "sft_loss": 2.459564208984375, + "step": 1785 + }, + { + "epoch": 0.9580197357417628, + "grad_norm": 11.271406982128253, + "learning_rate": 8.604767176061241e-07, + "logits/chosen": 0.10714355856180191, + "logits/rejected": 0.19515064358711243, + "logps/chosen": -2.5030503273010254, + "logps/rejected": -2.955298900604248, + "loss": 0.615, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.5030503273010254, + "rewards/margins": 0.4522481858730316, + "rewards/rejected": -2.955298900604248, + "sft_loss": 2.6175243854522705, + "step": 1790 + }, + { + "epoch": 0.9606957685231644, + "grad_norm": 7.772687838765346, + "learning_rate": 8.593957163144141e-07, + "logits/chosen": -0.03184313327074051, + "logits/rejected": 0.1284581869840622, + "logps/chosen": -2.3338072299957275, + "logps/rejected": -2.968812942504883, + "loss": 0.5771, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.3338072299957275, + "rewards/margins": 0.6350058913230896, + "rewards/rejected": -2.968812942504883, + "sft_loss": 2.5412683486938477, + "step": 1795 + }, + { + "epoch": 0.963371801304566, + "grad_norm": 8.865298487683116, + "learning_rate": 8.58311228163888e-07, + "logits/chosen": -0.03143691271543503, + "logits/rejected": 0.06218884512782097, + "logps/chosen": -2.478829860687256, + "logps/rejected": -2.9742467403411865, + "loss": 0.6032, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.478829860687256, + "rewards/margins": 0.4954164922237396, + "rewards/rejected": -2.9742467403411865, + "sft_loss": 2.5903961658477783, + "step": 1800 + }, + { + "epoch": 0.9660478340859675, + "grad_norm": 11.02813751654619, + "learning_rate": 8.57223263676255e-07, + "logits/chosen": -0.13884618878364563, + "logits/rejected": 0.01260617095977068, + "logps/chosen": -2.3413610458374023, + "logps/rejected": -3.1076226234436035, + "loss": 0.5275, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.3413610458374023, + "rewards/margins": 0.7662616968154907, + "rewards/rejected": -3.1076226234436035, + "sft_loss": 2.465789318084717, + "step": 1805 + }, + { + "epoch": 0.9687238668673691, + "grad_norm": 9.841681672444608, + "learning_rate": 8.561318334069511e-07, + "logits/chosen": -0.0021059750579297543, + "logits/rejected": 0.15467438101768494, + "logps/chosen": -2.4257614612579346, + "logps/rejected": -3.022879123687744, + "loss": 0.573, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.4257614612579346, + "rewards/margins": 0.5971179008483887, + "rewards/rejected": -3.022879123687744, + "sft_loss": 2.515134811401367, + "step": 1810 + }, + { + "epoch": 0.9713998996487707, + "grad_norm": 9.870310295591283, + "learning_rate": 8.550369479450375e-07, + "logits/chosen": -0.04107099026441574, + "logits/rejected": 0.12032093107700348, + "logps/chosen": -2.587172031402588, + "logps/rejected": -3.255514144897461, + "loss": 0.5651, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.587172031402588, + "rewards/margins": 0.6683421730995178, + "rewards/rejected": -3.255514144897461, + "sft_loss": 2.731900453567505, + "step": 1815 + }, + { + "epoch": 0.9740759324301723, + "grad_norm": 13.783549029664979, + "learning_rate": 8.539386179130977e-07, + "logits/chosen": 0.013371935114264488, + "logits/rejected": 0.07807054370641708, + "logps/chosen": -2.649529218673706, + "logps/rejected": -3.255265712738037, + "loss": 0.5839, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.649529218673706, + "rewards/margins": 0.6057366728782654, + "rewards/rejected": -3.255265712738037, + "sft_loss": 2.7066922187805176, + "step": 1820 + }, + { + "epoch": 0.9767519652115738, + "grad_norm": 10.932540143596936, + "learning_rate": 8.528368539671347e-07, + "logits/chosen": -0.08376463502645493, + "logits/rejected": 0.10879947245121002, + "logps/chosen": -2.476346492767334, + "logps/rejected": -3.3493971824645996, + "loss": 0.5491, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.476346492767334, + "rewards/margins": 0.8730506896972656, + "rewards/rejected": -3.3493971824645996, + "sft_loss": 2.656287908554077, + "step": 1825 + }, + { + "epoch": 0.9794279979929754, + "grad_norm": 11.332806366276783, + "learning_rate": 8.51731666796467e-07, + "logits/chosen": 0.06971029192209244, + "logits/rejected": 0.1344754993915558, + "logps/chosen": -2.857238292694092, + "logps/rejected": -3.4709160327911377, + "loss": 0.5962, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.857238292694092, + "rewards/margins": 0.613677442073822, + "rewards/rejected": -3.4709160327911377, + "sft_loss": 2.995389461517334, + "step": 1830 + }, + { + "epoch": 0.982104030774377, + "grad_norm": 12.236061486827184, + "learning_rate": 8.506230671236254e-07, + "logits/chosen": -0.05031920596957207, + "logits/rejected": 0.04754648730158806, + "logps/chosen": -2.865511417388916, + "logps/rejected": -3.368098497390747, + "loss": 0.6121, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.865511417388916, + "rewards/margins": 0.5025866627693176, + "rewards/rejected": -3.368098497390747, + "sft_loss": 3.0100932121276855, + "step": 1835 + }, + { + "epoch": 0.9847800635557785, + "grad_norm": 10.42805969789397, + "learning_rate": 8.495110657042488e-07, + "logits/chosen": 0.007626816630363464, + "logits/rejected": 0.1920599341392517, + "logps/chosen": -3.0169951915740967, + "logps/rejected": -3.7608039379119873, + "loss": 0.5544, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -3.0169951915740967, + "rewards/margins": 0.7438088655471802, + "rewards/rejected": -3.7608039379119873, + "sft_loss": 3.2395145893096924, + "step": 1840 + }, + { + "epoch": 0.9874560963371801, + "grad_norm": 18.67805455718663, + "learning_rate": 8.483956733269799e-07, + "logits/chosen": -0.031309835612773895, + "logits/rejected": 0.08653328567743301, + "logps/chosen": -3.035778045654297, + "logps/rejected": -3.7503821849823, + "loss": 0.5996, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.035778045654297, + "rewards/margins": 0.7146042585372925, + "rewards/rejected": -3.7503821849823, + "sft_loss": 3.326650619506836, + "step": 1845 + }, + { + "epoch": 0.9901321291185817, + "grad_norm": 15.850563785858194, + "learning_rate": 8.472769008133602e-07, + "logits/chosen": -0.18987888097763062, + "logits/rejected": -0.025769507512450218, + "logps/chosen": -3.2544639110565186, + "logps/rejected": -3.8407649993896484, + "loss": 0.6206, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -3.2544639110565186, + "rewards/margins": 0.5863012671470642, + "rewards/rejected": -3.8407649993896484, + "sft_loss": 3.4116978645324707, + "step": 1850 + }, + { + "epoch": 0.9928081618999832, + "grad_norm": 13.542824139028458, + "learning_rate": 8.461547590177259e-07, + "logits/chosen": -0.07165570557117462, + "logits/rejected": 0.09050522744655609, + "logps/chosen": -3.0256094932556152, + "logps/rejected": -3.739541530609131, + "loss": 0.6175, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -3.0256094932556152, + "rewards/margins": 0.7139323353767395, + "rewards/rejected": -3.739541530609131, + "sft_loss": 3.331972599029541, + "step": 1855 + }, + { + "epoch": 0.9954841946813848, + "grad_norm": 14.004127538179633, + "learning_rate": 8.450292588271014e-07, + "logits/chosen": -0.059675056487321854, + "logits/rejected": 0.08022672683000565, + "logps/chosen": -3.128673791885376, + "logps/rejected": -3.766014814376831, + "loss": 0.5919, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.128673791885376, + "rewards/margins": 0.6373409032821655, + "rewards/rejected": -3.766014814376831, + "sft_loss": 3.2620816230773926, + "step": 1860 + }, + { + "epoch": 0.9981602274627864, + "grad_norm": 13.526350959371076, + "learning_rate": 8.439004111610945e-07, + "logits/chosen": -0.06711464375257492, + "logits/rejected": 0.0229620523750782, + "logps/chosen": -2.7440898418426514, + "logps/rejected": -3.351423978805542, + "loss": 0.6055, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.7440898418426514, + "rewards/margins": 0.6073340773582458, + "rewards/rejected": -3.351423978805542, + "sft_loss": 2.9087913036346436, + "step": 1865 + }, + { + "epoch": 1.000836260244188, + "grad_norm": 11.344183671884279, + "learning_rate": 8.427682269717901e-07, + "logits/chosen": -0.10571829974651337, + "logits/rejected": 0.05615962669253349, + "logps/chosen": -2.7725422382354736, + "logps/rejected": -3.4601359367370605, + "loss": 0.5566, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.7725422382354736, + "rewards/margins": 0.6875935792922974, + "rewards/rejected": -3.4601359367370605, + "sft_loss": 2.911839246749878, + "step": 1870 + }, + { + "epoch": 1.0035122930255895, + "grad_norm": 11.766277223905877, + "learning_rate": 8.416327172436446e-07, + "logits/chosen": -0.16360296308994293, + "logits/rejected": -0.006053599528968334, + "logps/chosen": -2.719698667526245, + "logps/rejected": -3.208998441696167, + "loss": 0.6163, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.719698667526245, + "rewards/margins": 0.48930010199546814, + "rewards/rejected": -3.208998441696167, + "sft_loss": 2.786731719970703, + "step": 1875 + }, + { + "epoch": 1.0061883258069912, + "grad_norm": 11.76137503676261, + "learning_rate": 8.404938929933778e-07, + "logits/chosen": -0.015264851041138172, + "logits/rejected": 0.15779271721839905, + "logps/chosen": -2.614290952682495, + "logps/rejected": -3.4814743995666504, + "loss": 0.515, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.614290952682495, + "rewards/margins": 0.8671833276748657, + "rewards/rejected": -3.4814743995666504, + "sft_loss": 2.8025989532470703, + "step": 1880 + }, + { + "epoch": 1.0088643585883927, + "grad_norm": 9.733281383374747, + "learning_rate": 8.39351765269868e-07, + "logits/chosen": -0.08949162811040878, + "logits/rejected": 0.004027103073894978, + "logps/chosen": -2.561148166656494, + "logps/rejected": -3.174879789352417, + "loss": 0.6062, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.561148166656494, + "rewards/margins": 0.6137315034866333, + "rewards/rejected": -3.174879789352417, + "sft_loss": 2.714580774307251, + "step": 1885 + }, + { + "epoch": 1.0115403913697942, + "grad_norm": 13.196842268820903, + "learning_rate": 8.382063451540431e-07, + "logits/chosen": -0.08461443334817886, + "logits/rejected": 0.16043774783611298, + "logps/chosen": -2.725468397140503, + "logps/rejected": -3.4064719676971436, + "loss": 0.5576, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.725468397140503, + "rewards/margins": 0.6810039281845093, + "rewards/rejected": -3.4064719676971436, + "sft_loss": 2.99931001663208, + "step": 1890 + }, + { + "epoch": 1.014216424151196, + "grad_norm": 10.52990082304153, + "learning_rate": 8.370576437587742e-07, + "logits/chosen": -0.03379444777965546, + "logits/rejected": 0.010876533575356007, + "logps/chosen": -2.713357448577881, + "logps/rejected": -3.2912967205047607, + "loss": 0.5837, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.713357448577881, + "rewards/margins": 0.5779389142990112, + "rewards/rejected": -3.2912967205047607, + "sft_loss": 2.830855369567871, + "step": 1895 + }, + { + "epoch": 1.0168924569325974, + "grad_norm": 9.671101865530032, + "learning_rate": 8.359056722287674e-07, + "logits/chosen": -0.1582552045583725, + "logits/rejected": 0.1438843011856079, + "logps/chosen": -2.714686870574951, + "logps/rejected": -3.3958747386932373, + "loss": 0.5765, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.714686870574951, + "rewards/margins": 0.6811872720718384, + "rewards/rejected": -3.3958747386932373, + "sft_loss": 2.9313085079193115, + "step": 1900 + }, + { + "epoch": 1.019568489713999, + "grad_norm": 8.817897366753009, + "learning_rate": 8.347504417404553e-07, + "logits/chosen": -0.03570292145013809, + "logits/rejected": 0.1323067843914032, + "logps/chosen": -2.7149858474731445, + "logps/rejected": -3.3264877796173096, + "loss": 0.6033, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.7149858474731445, + "rewards/margins": 0.6115021109580994, + "rewards/rejected": -3.3264877796173096, + "sft_loss": 2.8391032218933105, + "step": 1905 + }, + { + "epoch": 1.0222445224954007, + "grad_norm": 9.514633202698919, + "learning_rate": 8.335919635018893e-07, + "logits/chosen": -0.16271661221981049, + "logits/rejected": -0.003299406263977289, + "logps/chosen": -2.598848581314087, + "logps/rejected": -3.1364142894744873, + "loss": 0.5932, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.598848581314087, + "rewards/margins": 0.5375655889511108, + "rewards/rejected": -3.1364142894744873, + "sft_loss": 2.7851195335388184, + "step": 1910 + }, + { + "epoch": 1.0249205552768021, + "grad_norm": 9.029115764201192, + "learning_rate": 8.324302487526303e-07, + "logits/chosen": -0.09520978480577469, + "logits/rejected": 0.021425556391477585, + "logps/chosen": -2.681286334991455, + "logps/rejected": -3.2381675243377686, + "loss": 0.5917, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.681286334991455, + "rewards/margins": 0.5568810701370239, + "rewards/rejected": -3.2381675243377686, + "sft_loss": 2.8256735801696777, + "step": 1915 + }, + { + "epoch": 1.0275965880582036, + "grad_norm": 7.915959995679739, + "learning_rate": 8.312653087636398e-07, + "logits/chosen": -0.12838247418403625, + "logits/rejected": -0.032163430005311966, + "logps/chosen": -2.5076041221618652, + "logps/rejected": -3.203805923461914, + "loss": 0.5654, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.5076041221618652, + "rewards/margins": 0.6962019205093384, + "rewards/rejected": -3.203805923461914, + "sft_loss": 2.7093334197998047, + "step": 1920 + }, + { + "epoch": 1.0302726208396054, + "grad_norm": 12.434460694472266, + "learning_rate": 8.300971548371711e-07, + "logits/chosen": -0.21484927833080292, + "logits/rejected": 0.02849237620830536, + "logps/chosen": -2.7233753204345703, + "logps/rejected": -3.3476593494415283, + "loss": 0.5705, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.7233753204345703, + "rewards/margins": 0.624284029006958, + "rewards/rejected": -3.3476593494415283, + "sft_loss": 2.8742966651916504, + "step": 1925 + }, + { + "epoch": 1.0329486536210069, + "grad_norm": 13.463123545165725, + "learning_rate": 8.289257983066582e-07, + "logits/chosen": -0.14655368030071259, + "logits/rejected": 0.009099148213863373, + "logps/chosen": -2.6282095909118652, + "logps/rejected": -3.3094677925109863, + "loss": 0.555, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.6282095909118652, + "rewards/margins": 0.6812580227851868, + "rewards/rejected": -3.3094677925109863, + "sft_loss": 2.856292486190796, + "step": 1930 + }, + { + "epoch": 1.0356246864024083, + "grad_norm": 15.604488504855244, + "learning_rate": 8.277512505366077e-07, + "logits/chosen": -0.18012908101081848, + "logits/rejected": 0.05511124059557915, + "logps/chosen": -2.7234902381896973, + "logps/rejected": -3.4398033618927, + "loss": 0.5666, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.7234902381896973, + "rewards/margins": 0.7163132429122925, + "rewards/rejected": -3.4398033618927, + "sft_loss": 2.827632427215576, + "step": 1935 + }, + { + "epoch": 1.03830071918381, + "grad_norm": 9.738078522357188, + "learning_rate": 8.265735229224868e-07, + "logits/chosen": -0.10542136430740356, + "logits/rejected": 0.033803604543209076, + "logps/chosen": -2.6598103046417236, + "logps/rejected": -3.473357677459717, + "loss": 0.5437, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.6598103046417236, + "rewards/margins": 0.8135476112365723, + "rewards/rejected": -3.473357677459717, + "sft_loss": 2.6809260845184326, + "step": 1940 + }, + { + "epoch": 1.0409767519652116, + "grad_norm": 7.820147815952518, + "learning_rate": 8.253926268906144e-07, + "logits/chosen": -0.19088909029960632, + "logits/rejected": -0.005916637368500233, + "logps/chosen": -2.7106292247772217, + "logps/rejected": -3.611953020095825, + "loss": 0.5106, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.7106292247772217, + "rewards/margins": 0.9013236165046692, + "rewards/rejected": -3.611953020095825, + "sft_loss": 2.822298526763916, + "step": 1945 + }, + { + "epoch": 1.043652784746613, + "grad_norm": 11.916728732236763, + "learning_rate": 8.242085738980487e-07, + "logits/chosen": -0.09535542875528336, + "logits/rejected": 0.16036078333854675, + "logps/chosen": -2.753713369369507, + "logps/rejected": -3.4758613109588623, + "loss": 0.5628, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.753713369369507, + "rewards/margins": 0.7221483588218689, + "rewards/rejected": -3.4758613109588623, + "sft_loss": 2.8452465534210205, + "step": 1950 + }, + { + "epoch": 1.0463288175280148, + "grad_norm": 12.036744464835651, + "learning_rate": 8.230213754324772e-07, + "logits/chosen": -0.12327710539102554, + "logits/rejected": -0.037312399595975876, + "logps/chosen": -2.7057414054870605, + "logps/rejected": -3.308201313018799, + "loss": 0.5719, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.7057414054870605, + "rewards/margins": 0.602459728717804, + "rewards/rejected": -3.308201313018799, + "sft_loss": 2.8045592308044434, + "step": 1955 + }, + { + "epoch": 1.0490048503094163, + "grad_norm": 11.974887547568208, + "learning_rate": 8.218310430121045e-07, + "logits/chosen": -0.10618670284748077, + "logits/rejected": -0.06071647256612778, + "logps/chosen": -2.6146535873413086, + "logps/rejected": -3.202754259109497, + "loss": 0.5934, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.6146535873413086, + "rewards/margins": 0.5881003737449646, + "rewards/rejected": -3.202754259109497, + "sft_loss": 2.719392776489258, + "step": 1960 + }, + { + "epoch": 1.051680883090818, + "grad_norm": 9.87186260378657, + "learning_rate": 8.20637588185541e-07, + "logits/chosen": -0.06013760715723038, + "logits/rejected": 0.028408635407686234, + "logps/chosen": -2.637633800506592, + "logps/rejected": -3.5640335083007812, + "loss": 0.5077, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.637633800506592, + "rewards/margins": 0.926399827003479, + "rewards/rejected": -3.5640335083007812, + "sft_loss": 2.814918041229248, + "step": 1965 + }, + { + "epoch": 1.0543569158722195, + "grad_norm": 8.711974662345696, + "learning_rate": 8.194410225316906e-07, + "logits/chosen": -0.13953541219234467, + "logits/rejected": 0.04353134706616402, + "logps/chosen": -2.487884521484375, + "logps/rejected": -3.113614320755005, + "loss": 0.5679, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.487884521484375, + "rewards/margins": 0.6257299780845642, + "rewards/rejected": -3.113614320755005, + "sft_loss": 2.620357036590576, + "step": 1970 + }, + { + "epoch": 1.057032948653621, + "grad_norm": 11.306716862032234, + "learning_rate": 8.182413576596385e-07, + "logits/chosen": 0.0013474032748490572, + "logits/rejected": 0.08792434632778168, + "logps/chosen": -2.478005886077881, + "logps/rejected": -3.1275486946105957, + "loss": 0.5732, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.478005886077881, + "rewards/margins": 0.6495428085327148, + "rewards/rejected": -3.1275486946105957, + "sft_loss": 2.6386067867279053, + "step": 1975 + }, + { + "epoch": 1.0597089814350227, + "grad_norm": 11.784456427703102, + "learning_rate": 8.170386052085389e-07, + "logits/chosen": 0.014350226148962975, + "logits/rejected": 0.14056164026260376, + "logps/chosen": -2.672428607940674, + "logps/rejected": -3.3221981525421143, + "loss": 0.6038, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.672428607940674, + "rewards/margins": 0.6497694253921509, + "rewards/rejected": -3.3221981525421143, + "sft_loss": 2.784529685974121, + "step": 1980 + }, + { + "epoch": 1.0623850142164242, + "grad_norm": 10.737430694761672, + "learning_rate": 8.158327768475008e-07, + "logits/chosen": -0.05563334748148918, + "logits/rejected": 0.1250276118516922, + "logps/chosen": -2.6196036338806152, + "logps/rejected": -3.1013073921203613, + "loss": 0.6358, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.6196036338806152, + "rewards/margins": 0.48170357942581177, + "rewards/rejected": -3.1013073921203613, + "sft_loss": 2.7226316928863525, + "step": 1985 + }, + { + "epoch": 1.0650610469978257, + "grad_norm": 12.96106628879117, + "learning_rate": 8.146238842754767e-07, + "logits/chosen": -0.1020413488149643, + "logits/rejected": 0.020089680328965187, + "logps/chosen": -2.629999876022339, + "logps/rejected": -3.177628993988037, + "loss": 0.5901, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.629999876022339, + "rewards/margins": 0.5476290583610535, + "rewards/rejected": -3.177628993988037, + "sft_loss": 2.7194972038269043, + "step": 1990 + }, + { + "epoch": 1.0677370797792274, + "grad_norm": 13.03319722387789, + "learning_rate": 8.134119392211476e-07, + "logits/chosen": 0.02129880152642727, + "logits/rejected": 0.20938155055046082, + "logps/chosen": -2.5209169387817383, + "logps/rejected": -3.362943172454834, + "loss": 0.5345, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.5209169387817383, + "rewards/margins": 0.8420262336730957, + "rewards/rejected": -3.362943172454834, + "sft_loss": 2.6732523441314697, + "step": 1995 + }, + { + "epoch": 1.0704131125606289, + "grad_norm": 16.541188511965675, + "learning_rate": 8.121969534428094e-07, + "logits/chosen": -0.11407023668289185, + "logits/rejected": 0.07046560198068619, + "logps/chosen": -2.650700807571411, + "logps/rejected": -3.288252592086792, + "loss": 0.6195, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.650700807571411, + "rewards/margins": 0.6375521421432495, + "rewards/rejected": -3.288252592086792, + "sft_loss": 2.7548530101776123, + "step": 2000 + }, + { + "epoch": 1.0704131125606289, + "eval_logits/chosen": 0.3371480703353882, + "eval_logits/rejected": 0.45726171135902405, + "eval_logps/chosen": -2.545128107070923, + "eval_logps/rejected": -3.2527496814727783, + "eval_loss": 0.5672591328620911, + "eval_rewards/accuracies": 0.7129080295562744, + "eval_rewards/chosen": -2.545128107070923, + "eval_rewards/margins": 0.7076213359832764, + "eval_rewards/rejected": -3.2527496814727783, + "eval_runtime": 49.6983, + "eval_samples_per_second": 27.063, + "eval_sft_loss": 2.7121262550354004, + "eval_steps_per_second": 6.781, + "step": 2000 + }, + { + "epoch": 1.0730891453420304, + "grad_norm": 12.203435349479575, + "learning_rate": 8.109789387282599e-07, + "logits/chosen": -0.03285417705774307, + "logits/rejected": 0.04845278710126877, + "logps/chosen": -2.5501391887664795, + "logps/rejected": -3.1087799072265625, + "loss": 0.6308, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.5501391887664795, + "rewards/margins": 0.5586405396461487, + "rewards/rejected": -3.1087799072265625, + "sft_loss": 2.6897871494293213, + "step": 2005 + }, + { + "epoch": 1.075765178123432, + "grad_norm": 15.12578662488471, + "learning_rate": 8.097579068946827e-07, + "logits/chosen": -0.0025532320141792297, + "logits/rejected": 0.14059139788150787, + "logps/chosen": -2.384495973587036, + "logps/rejected": -3.02142596244812, + "loss": 0.5619, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.384495973587036, + "rewards/margins": 0.636929988861084, + "rewards/rejected": -3.02142596244812, + "sft_loss": 2.541128635406494, + "step": 2010 + }, + { + "epoch": 1.0784412109048336, + "grad_norm": 10.919169894003698, + "learning_rate": 8.085338697885344e-07, + "logits/chosen": -0.020297734066843987, + "logits/rejected": 0.12689833343029022, + "logps/chosen": -2.4526207447052, + "logps/rejected": -3.089092969894409, + "loss": 0.5716, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.4526207447052, + "rewards/margins": 0.6364722847938538, + "rewards/rejected": -3.089092969894409, + "sft_loss": 2.522400140762329, + "step": 2015 + }, + { + "epoch": 1.081117243686235, + "grad_norm": 12.437741570509774, + "learning_rate": 8.073068392854282e-07, + "logits/chosen": -0.16363094747066498, + "logits/rejected": 0.08789423853158951, + "logps/chosen": -2.60274600982666, + "logps/rejected": -3.3508548736572266, + "loss": 0.5272, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.60274600982666, + "rewards/margins": 0.7481087446212769, + "rewards/rejected": -3.3508548736572266, + "sft_loss": 2.661445379257202, + "step": 2020 + }, + { + "epoch": 1.0837932764676368, + "grad_norm": 9.100173797791186, + "learning_rate": 8.060768272900193e-07, + "logits/chosen": -0.024624507874250412, + "logits/rejected": 0.15064984560012817, + "logps/chosen": -2.4519011974334717, + "logps/rejected": -3.172219753265381, + "loss": 0.5684, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.4519011974334717, + "rewards/margins": 0.7203187942504883, + "rewards/rejected": -3.172219753265381, + "sft_loss": 2.6379141807556152, + "step": 2025 + }, + { + "epoch": 1.0864693092490383, + "grad_norm": 9.418736880908407, + "learning_rate": 8.0484384573589e-07, + "logits/chosen": -0.07204429060220718, + "logits/rejected": -0.027175098657608032, + "logps/chosen": -2.4381704330444336, + "logps/rejected": -3.055497169494629, + "loss": 0.5735, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.4381704330444336, + "rewards/margins": 0.6173266172409058, + "rewards/rejected": -3.055497169494629, + "sft_loss": 2.570493698120117, + "step": 2030 + }, + { + "epoch": 1.0891453420304398, + "grad_norm": 14.029045781506598, + "learning_rate": 8.03607906585432e-07, + "logits/chosen": -0.09846794605255127, + "logits/rejected": 0.08520887792110443, + "logps/chosen": -2.560640335083008, + "logps/rejected": -3.2208092212677, + "loss": 0.5881, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.560640335083008, + "rewards/margins": 0.6601688265800476, + "rewards/rejected": -3.2208092212677, + "sft_loss": 2.736283779144287, + "step": 2035 + }, + { + "epoch": 1.0918213748118415, + "grad_norm": 26.009942660064883, + "learning_rate": 8.023690218297329e-07, + "logits/chosen": -0.17696990072727203, + "logits/rejected": -0.08916838467121124, + "logps/chosen": -2.5884757041931152, + "logps/rejected": -3.3237998485565186, + "loss": 0.5634, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.5884757041931152, + "rewards/margins": 0.7353242635726929, + "rewards/rejected": -3.3237998485565186, + "sft_loss": 2.683675527572632, + "step": 2040 + }, + { + "epoch": 1.094497407593243, + "grad_norm": 12.131765431140455, + "learning_rate": 8.01127203488458e-07, + "logits/chosen": -0.040083326399326324, + "logits/rejected": 0.01661711558699608, + "logps/chosen": -2.7470474243164062, + "logps/rejected": -3.420219898223877, + "loss": 0.5626, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.7470474243164062, + "rewards/margins": 0.6731725335121155, + "rewards/rejected": -3.420219898223877, + "sft_loss": 2.886335849761963, + "step": 2045 + }, + { + "epoch": 1.0971734403746445, + "grad_norm": 13.591583277242973, + "learning_rate": 7.998824636097339e-07, + "logits/chosen": -0.17442944645881653, + "logits/rejected": -0.010418994352221489, + "logps/chosen": -2.714845895767212, + "logps/rejected": -3.4485251903533936, + "loss": 0.5542, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.714845895767212, + "rewards/margins": 0.7336792945861816, + "rewards/rejected": -3.4485251903533936, + "sft_loss": 2.9100375175476074, + "step": 2050 + }, + { + "epoch": 1.0998494731560462, + "grad_norm": 16.109926012977844, + "learning_rate": 7.986348142700328e-07, + "logits/chosen": -0.11374533176422119, + "logits/rejected": 0.06635691970586777, + "logps/chosen": -2.861820936203003, + "logps/rejected": -3.820934772491455, + "loss": 0.5276, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.861820936203003, + "rewards/margins": 0.9591139554977417, + "rewards/rejected": -3.820934772491455, + "sft_loss": 3.154653310775757, + "step": 2055 + }, + { + "epoch": 1.1025255059374477, + "grad_norm": 16.53595169896097, + "learning_rate": 7.973842675740539e-07, + "logits/chosen": -0.05169066786766052, + "logits/rejected": 0.02870332822203636, + "logps/chosen": -2.8666768074035645, + "logps/rejected": -3.7568085193634033, + "loss": 0.5237, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.8666768074035645, + "rewards/margins": 0.8901316523551941, + "rewards/rejected": -3.7568085193634033, + "sft_loss": 3.1012189388275146, + "step": 2060 + }, + { + "epoch": 1.1052015387188494, + "grad_norm": 14.205600074036031, + "learning_rate": 7.961308356546066e-07, + "logits/chosen": -0.08631865680217743, + "logits/rejected": 0.09614025056362152, + "logps/chosen": -3.0432827472686768, + "logps/rejected": -3.993431806564331, + "loss": 0.5182, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.0432827472686768, + "rewards/margins": 0.9501487016677856, + "rewards/rejected": -3.993431806564331, + "sft_loss": 3.184915065765381, + "step": 2065 + }, + { + "epoch": 1.107877571500251, + "grad_norm": 18.08939313670418, + "learning_rate": 7.948745306724931e-07, + "logits/chosen": -0.08003537356853485, + "logits/rejected": 0.1082262396812439, + "logps/chosen": -2.843501091003418, + "logps/rejected": -3.8439784049987793, + "loss": 0.4818, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.843501091003418, + "rewards/margins": 1.0004774332046509, + "rewards/rejected": -3.8439784049987793, + "sft_loss": 2.974362850189209, + "step": 2070 + }, + { + "epoch": 1.1105536042816524, + "grad_norm": 17.045952758047306, + "learning_rate": 7.936153648163897e-07, + "logits/chosen": -0.150905042886734, + "logits/rejected": -0.01554001122713089, + "logps/chosen": -3.081815719604492, + "logps/rejected": -3.82598876953125, + "loss": 0.5928, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.081815719604492, + "rewards/margins": 0.7441730499267578, + "rewards/rejected": -3.82598876953125, + "sft_loss": 3.387814998626709, + "step": 2075 + }, + { + "epoch": 1.1132296370630541, + "grad_norm": 13.732378711313746, + "learning_rate": 7.92353350302729e-07, + "logits/chosen": -0.20131368935108185, + "logits/rejected": 0.002260335488244891, + "logps/chosen": -2.891542911529541, + "logps/rejected": -3.7180428504943848, + "loss": 0.5424, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.891542911529541, + "rewards/margins": 0.826499342918396, + "rewards/rejected": -3.7180428504943848, + "sft_loss": 3.1301307678222656, + "step": 2080 + }, + { + "epoch": 1.1159056698444556, + "grad_norm": 16.660204890320408, + "learning_rate": 7.910884993755816e-07, + "logits/chosen": -0.15335850417613983, + "logits/rejected": -0.04589155316352844, + "logps/chosen": -2.9131200313568115, + "logps/rejected": -3.7110915184020996, + "loss": 0.5577, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.9131200313568115, + "rewards/margins": 0.7979711294174194, + "rewards/rejected": -3.7110915184020996, + "sft_loss": 3.070864200592041, + "step": 2085 + }, + { + "epoch": 1.118581702625857, + "grad_norm": 13.701172511940262, + "learning_rate": 7.898208243065367e-07, + "logits/chosen": -0.18213477730751038, + "logits/rejected": -0.1654680222272873, + "logps/chosen": -2.819640874862671, + "logps/rejected": -3.414013624191284, + "loss": 0.6072, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.819640874862671, + "rewards/margins": 0.5943728685379028, + "rewards/rejected": -3.414013624191284, + "sft_loss": 3.111002206802368, + "step": 2090 + }, + { + "epoch": 1.1212577354072588, + "grad_norm": 16.53325718701653, + "learning_rate": 7.88550337394583e-07, + "logits/chosen": -0.21913953125476837, + "logits/rejected": -0.05338749289512634, + "logps/chosen": -3.027158498764038, + "logps/rejected": -3.6880135536193848, + "loss": 0.5923, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -3.027158498764038, + "rewards/margins": 0.6608546376228333, + "rewards/rejected": -3.6880135536193848, + "sft_loss": 3.1259403228759766, + "step": 2095 + }, + { + "epoch": 1.1239337681886603, + "grad_norm": 15.264267957216644, + "learning_rate": 7.872770509659905e-07, + "logits/chosen": -0.0772733986377716, + "logits/rejected": -0.007263128645718098, + "logps/chosen": -2.935605525970459, + "logps/rejected": -3.6056618690490723, + "loss": 0.5867, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.935605525970459, + "rewards/margins": 0.6700563430786133, + "rewards/rejected": -3.6056618690490723, + "sft_loss": 3.046522855758667, + "step": 2100 + }, + { + "epoch": 1.1266098009700618, + "grad_norm": 14.343161073716592, + "learning_rate": 7.860009773741896e-07, + "logits/chosen": -0.064370296895504, + "logits/rejected": 0.11259357631206512, + "logps/chosen": -2.7505581378936768, + "logps/rejected": -3.5925374031066895, + "loss": 0.5111, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.7505581378936768, + "rewards/margins": 0.8419793844223022, + "rewards/rejected": -3.5925374031066895, + "sft_loss": 2.818540096282959, + "step": 2105 + }, + { + "epoch": 1.1292858337514635, + "grad_norm": 15.151824872919203, + "learning_rate": 7.84722128999652e-07, + "logits/chosen": -0.12127542495727539, + "logits/rejected": 0.044526226818561554, + "logps/chosen": -2.701420307159424, + "logps/rejected": -3.569472551345825, + "loss": 0.5615, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.701420307159424, + "rewards/margins": 0.8680523633956909, + "rewards/rejected": -3.569472551345825, + "sft_loss": 2.8453078269958496, + "step": 2110 + }, + { + "epoch": 1.131961866532865, + "grad_norm": 13.018946550229275, + "learning_rate": 7.834405182497699e-07, + "logits/chosen": -0.039768896996974945, + "logits/rejected": 0.02067652717232704, + "logps/chosen": -2.7957961559295654, + "logps/rejected": -3.5757274627685547, + "loss": 0.5708, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.7957961559295654, + "rewards/margins": 0.779931366443634, + "rewards/rejected": -3.5757274627685547, + "sft_loss": 2.929673910140991, + "step": 2115 + }, + { + "epoch": 1.1346378993142665, + "grad_norm": 15.20906486976137, + "learning_rate": 7.821561575587368e-07, + "logits/chosen": -0.1733914464712143, + "logits/rejected": -0.12084399163722992, + "logps/chosen": -2.7386631965637207, + "logps/rejected": -3.3611443042755127, + "loss": 0.5547, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.7386631965637207, + "rewards/margins": 0.6224810481071472, + "rewards/rejected": -3.3611443042755127, + "sft_loss": 2.9166512489318848, + "step": 2120 + }, + { + "epoch": 1.1373139320956682, + "grad_norm": 10.09351337769588, + "learning_rate": 7.808690593874254e-07, + "logits/chosen": -0.17015834152698517, + "logits/rejected": -0.0607013925909996, + "logps/chosen": -2.801271677017212, + "logps/rejected": -3.549854278564453, + "loss": 0.5552, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.801271677017212, + "rewards/margins": 0.7485824823379517, + "rewards/rejected": -3.549854278564453, + "sft_loss": 2.9816415309906006, + "step": 2125 + }, + { + "epoch": 1.1399899648770697, + "grad_norm": 13.731817537809986, + "learning_rate": 7.79579236223268e-07, + "logits/chosen": -0.10042476654052734, + "logits/rejected": 0.1891237497329712, + "logps/chosen": -2.7051987648010254, + "logps/rejected": -3.6020240783691406, + "loss": 0.5231, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.7051987648010254, + "rewards/margins": 0.8968254923820496, + "rewards/rejected": -3.6020240783691406, + "sft_loss": 2.8918728828430176, + "step": 2130 + }, + { + "epoch": 1.1426659976584714, + "grad_norm": 13.688072489475777, + "learning_rate": 7.782867005801346e-07, + "logits/chosen": -0.11776898056268692, + "logits/rejected": 0.11693098396062851, + "logps/chosen": -2.7273452281951904, + "logps/rejected": -3.5392394065856934, + "loss": 0.5442, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.7273452281951904, + "rewards/margins": 0.8118942975997925, + "rewards/rejected": -3.5392394065856934, + "sft_loss": 2.8112258911132812, + "step": 2135 + }, + { + "epoch": 1.145342030439873, + "grad_norm": 17.846087519185875, + "learning_rate": 7.769914649982117e-07, + "logits/chosen": -0.11212320625782013, + "logits/rejected": 0.05280379205942154, + "logps/chosen": -2.698925018310547, + "logps/rejected": -3.4237124919891357, + "loss": 0.5754, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.698925018310547, + "rewards/margins": 0.7247874736785889, + "rewards/rejected": -3.4237124919891357, + "sft_loss": 2.8307290077209473, + "step": 2140 + }, + { + "epoch": 1.1480180632212744, + "grad_norm": 14.376053087200567, + "learning_rate": 7.756935420438803e-07, + "logits/chosen": -0.09898360073566437, + "logits/rejected": 0.021031454205513, + "logps/chosen": -2.5822346210479736, + "logps/rejected": -3.566603899002075, + "loss": 0.519, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.5822346210479736, + "rewards/margins": 0.9843686819076538, + "rewards/rejected": -3.566603899002075, + "sft_loss": 2.718616008758545, + "step": 2145 + }, + { + "epoch": 1.1506940960026761, + "grad_norm": 11.203256988700122, + "learning_rate": 7.743929443095951e-07, + "logits/chosen": -0.12101199477910995, + "logits/rejected": -0.04032517224550247, + "logps/chosen": -2.7327792644500732, + "logps/rejected": -3.550232410430908, + "loss": 0.5139, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.7327792644500732, + "rewards/margins": 0.8174529075622559, + "rewards/rejected": -3.550232410430908, + "sft_loss": 2.8194966316223145, + "step": 2150 + }, + { + "epoch": 1.1533701287840776, + "grad_norm": 13.665641869802482, + "learning_rate": 7.730896844137609e-07, + "logits/chosen": -0.06756674498319626, + "logits/rejected": 0.04592638835310936, + "logps/chosen": -2.945727586746216, + "logps/rejected": -3.566514492034912, + "loss": 0.608, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.945727586746216, + "rewards/margins": 0.6207873821258545, + "rewards/rejected": -3.566514492034912, + "sft_loss": 3.0970585346221924, + "step": 2155 + }, + { + "epoch": 1.1560461615654791, + "grad_norm": 16.169765648383418, + "learning_rate": 7.717837750006106e-07, + "logits/chosen": -0.09466465562582016, + "logits/rejected": 0.007165367715060711, + "logps/chosen": -2.7117316722869873, + "logps/rejected": -3.6370861530303955, + "loss": 0.5396, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.7117316722869873, + "rewards/margins": 0.9253547787666321, + "rewards/rejected": -3.6370861530303955, + "sft_loss": 2.8119845390319824, + "step": 2160 + }, + { + "epoch": 1.1587221943468808, + "grad_norm": 13.68381522374296, + "learning_rate": 7.704752287400832e-07, + "logits/chosen": -0.07723397761583328, + "logits/rejected": 0.1379738301038742, + "logps/chosen": -2.777071475982666, + "logps/rejected": -3.6475002765655518, + "loss": 0.5489, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.777071475982666, + "rewards/margins": 0.8704291582107544, + "rewards/rejected": -3.6475002765655518, + "sft_loss": 2.910811424255371, + "step": 2165 + }, + { + "epoch": 1.1613982271282823, + "grad_norm": 9.552484298341081, + "learning_rate": 7.691640583277004e-07, + "logits/chosen": -0.08300045132637024, + "logits/rejected": 0.10612811893224716, + "logps/chosen": -2.6649529933929443, + "logps/rejected": -3.496000289916992, + "loss": 0.5528, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.6649529933929443, + "rewards/margins": 0.8310472369194031, + "rewards/rejected": -3.496000289916992, + "sft_loss": 2.795423984527588, + "step": 2170 + }, + { + "epoch": 1.1640742599096838, + "grad_norm": 10.432443980432863, + "learning_rate": 7.678502764844433e-07, + "logits/chosen": -0.13375064730644226, + "logits/rejected": 0.08962388336658478, + "logps/chosen": -2.653404712677002, + "logps/rejected": -3.332920551300049, + "loss": 0.5569, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.653404712677002, + "rewards/margins": 0.679515540599823, + "rewards/rejected": -3.332920551300049, + "sft_loss": 2.7964444160461426, + "step": 2175 + }, + { + "epoch": 1.1667502926910855, + "grad_norm": 10.949292698095055, + "learning_rate": 7.665338959566288e-07, + "logits/chosen": -0.09721340239048004, + "logits/rejected": 0.022823205217719078, + "logps/chosen": -2.5667824745178223, + "logps/rejected": -3.463876724243164, + "loss": 0.5048, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.5667824745178223, + "rewards/margins": 0.8970942497253418, + "rewards/rejected": -3.463876724243164, + "sft_loss": 2.7630155086517334, + "step": 2180 + }, + { + "epoch": 1.169426325472487, + "grad_norm": 16.319514759464216, + "learning_rate": 7.652149295157868e-07, + "logits/chosen": -0.019224589690566063, + "logits/rejected": 0.17753568291664124, + "logps/chosen": -2.734881639480591, + "logps/rejected": -3.398794651031494, + "loss": 0.5615, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.734881639480591, + "rewards/margins": 0.6639131307601929, + "rewards/rejected": -3.398794651031494, + "sft_loss": 2.8227455615997314, + "step": 2185 + }, + { + "epoch": 1.1721023582538885, + "grad_norm": 11.860038540569555, + "learning_rate": 7.638933899585354e-07, + "logits/chosen": 0.06096430495381355, + "logits/rejected": 0.0971364825963974, + "logps/chosen": -2.660165309906006, + "logps/rejected": -3.4485931396484375, + "loss": 0.5471, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.660165309906006, + "rewards/margins": 0.7884277105331421, + "rewards/rejected": -3.4485931396484375, + "sft_loss": 2.973078727722168, + "step": 2190 + }, + { + "epoch": 1.1747783910352902, + "grad_norm": 12.804501417380948, + "learning_rate": 7.625692901064573e-07, + "logits/chosen": -0.04396064206957817, + "logits/rejected": 0.08522314578294754, + "logps/chosen": -2.8927605152130127, + "logps/rejected": -3.7299964427948, + "loss": 0.5893, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.8927605152130127, + "rewards/margins": 0.8372361063957214, + "rewards/rejected": -3.7299964427948, + "sft_loss": 3.15350604057312, + "step": 2195 + }, + { + "epoch": 1.1774544238166917, + "grad_norm": 11.705850285187246, + "learning_rate": 7.61242642805975e-07, + "logits/chosen": -0.0731678232550621, + "logits/rejected": -0.07998080551624298, + "logps/chosen": -2.737246036529541, + "logps/rejected": -3.451080322265625, + "loss": 0.5729, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.737246036529541, + "rewards/margins": 0.7138343453407288, + "rewards/rejected": -3.451080322265625, + "sft_loss": 2.9744622707366943, + "step": 2200 + }, + { + "epoch": 1.1801304565980932, + "grad_norm": 11.625726585306893, + "learning_rate": 7.599134609282266e-07, + "logits/chosen": -0.15962481498718262, + "logits/rejected": 0.07510614395141602, + "logps/chosen": -2.7328078746795654, + "logps/rejected": -3.451481580734253, + "loss": 0.5604, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.7328078746795654, + "rewards/margins": 0.7186736464500427, + "rewards/rejected": -3.451481580734253, + "sft_loss": 2.8423523902893066, + "step": 2205 + }, + { + "epoch": 1.182806489379495, + "grad_norm": 11.640982228332257, + "learning_rate": 7.585817573689402e-07, + "logits/chosen": -0.1442055106163025, + "logits/rejected": 0.0014272450935095549, + "logps/chosen": -2.437304735183716, + "logps/rejected": -3.400106906890869, + "loss": 0.492, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -2.437304735183716, + "rewards/margins": 0.9628024101257324, + "rewards/rejected": -3.400106906890869, + "sft_loss": 2.632190227508545, + "step": 2210 + }, + { + "epoch": 1.1854825221608964, + "grad_norm": 11.954836265342767, + "learning_rate": 7.572475450483098e-07, + "logits/chosen": -0.13480237126350403, + "logits/rejected": -0.02519914135336876, + "logps/chosen": -2.741669178009033, + "logps/rejected": -3.451484203338623, + "loss": 0.5723, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.741669178009033, + "rewards/margins": 0.7098146677017212, + "rewards/rejected": -3.451484203338623, + "sft_loss": 2.809096097946167, + "step": 2215 + }, + { + "epoch": 1.188158554942298, + "grad_norm": 13.788219036304126, + "learning_rate": 7.559108369108689e-07, + "logits/chosen": -0.19939342141151428, + "logits/rejected": -0.054556954652071, + "logps/chosen": -2.576840877532959, + "logps/rejected": -3.2807235717773438, + "loss": 0.5834, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.576840877532959, + "rewards/margins": 0.7038829922676086, + "rewards/rejected": -3.2807235717773438, + "sft_loss": 2.7406558990478516, + "step": 2220 + }, + { + "epoch": 1.1908345877236997, + "grad_norm": 11.214294947271048, + "learning_rate": 7.54571645925366e-07, + "logits/chosen": -0.20111767947673798, + "logits/rejected": 0.10563284158706665, + "logps/chosen": -2.5377230644226074, + "logps/rejected": -3.4687418937683105, + "loss": 0.5078, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.5377230644226074, + "rewards/margins": 0.9310193061828613, + "rewards/rejected": -3.4687418937683105, + "sft_loss": 2.7131781578063965, + "step": 2225 + }, + { + "epoch": 1.1935106205051011, + "grad_norm": 14.038927448722399, + "learning_rate": 7.532299850846378e-07, + "logits/chosen": -0.1545659601688385, + "logits/rejected": 0.048467788845300674, + "logps/chosen": -2.569761037826538, + "logps/rejected": -3.4898669719696045, + "loss": 0.5239, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.569761037826538, + "rewards/margins": 0.9201061129570007, + "rewards/rejected": -3.4898669719696045, + "sft_loss": 2.6959354877471924, + "step": 2230 + }, + { + "epoch": 1.1961866532865026, + "grad_norm": 22.202008098046015, + "learning_rate": 7.518858674054838e-07, + "logits/chosen": -0.16763782501220703, + "logits/rejected": 0.07074837386608124, + "logps/chosen": -2.594499111175537, + "logps/rejected": -3.4791977405548096, + "loss": 0.5313, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.594499111175537, + "rewards/margins": 0.8846985101699829, + "rewards/rejected": -3.4791977405548096, + "sft_loss": 2.7077810764312744, + "step": 2235 + }, + { + "epoch": 1.1988626860679044, + "grad_norm": 11.638140383512662, + "learning_rate": 7.505393059285394e-07, + "logits/chosen": -0.1514950543642044, + "logits/rejected": 0.05857670307159424, + "logps/chosen": -2.7575697898864746, + "logps/rejected": -3.520163059234619, + "loss": 0.5629, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.7575697898864746, + "rewards/margins": 0.7625933885574341, + "rewards/rejected": -3.520163059234619, + "sft_loss": 2.9731178283691406, + "step": 2240 + }, + { + "epoch": 1.2015387188493059, + "grad_norm": 13.490389163955774, + "learning_rate": 7.491903137181501e-07, + "logits/chosen": -0.06201281026005745, + "logits/rejected": -0.00964923482388258, + "logps/chosen": -2.607433795928955, + "logps/rejected": -3.3961021900177, + "loss": 0.5365, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.607433795928955, + "rewards/margins": 0.7886683940887451, + "rewards/rejected": -3.3961021900177, + "sft_loss": 2.8041486740112305, + "step": 2245 + }, + { + "epoch": 1.2042147516307076, + "grad_norm": 13.885635892847203, + "learning_rate": 7.478389038622441e-07, + "logits/chosen": 0.021996164694428444, + "logits/rejected": 0.07282289117574692, + "logps/chosen": -2.7160027027130127, + "logps/rejected": -3.5241007804870605, + "loss": 0.5597, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.7160027027130127, + "rewards/margins": 0.8080977201461792, + "rewards/rejected": -3.5241007804870605, + "sft_loss": 2.846470594406128, + "step": 2250 + }, + { + "epoch": 1.206890784412109, + "grad_norm": 17.65495152779927, + "learning_rate": 7.46485089472206e-07, + "logits/chosen": -0.07781971246004105, + "logits/rejected": 0.011274236254394054, + "logps/chosen": -2.707087993621826, + "logps/rejected": -3.4074695110321045, + "loss": 0.6023, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.707087993621826, + "rewards/margins": 0.7003816366195679, + "rewards/rejected": -3.4074695110321045, + "sft_loss": 2.811058521270752, + "step": 2255 + }, + { + "epoch": 1.2095668171935106, + "grad_norm": 12.9064548583628, + "learning_rate": 7.451288836827487e-07, + "logits/chosen": -0.034752242267131805, + "logits/rejected": -0.03955184295773506, + "logps/chosen": -2.5647547245025635, + "logps/rejected": -3.176440954208374, + "loss": 0.5846, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.5647547245025635, + "rewards/margins": 0.6116862893104553, + "rewards/rejected": -3.176440954208374, + "sft_loss": 2.714322328567505, + "step": 2260 + }, + { + "epoch": 1.2122428499749123, + "grad_norm": 11.576293376589549, + "learning_rate": 7.437702996517869e-07, + "logits/chosen": -0.0587887242436409, + "logits/rejected": 0.057128388434648514, + "logps/chosen": -2.555610179901123, + "logps/rejected": -3.3883907794952393, + "loss": 0.5334, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.555610179901123, + "rewards/margins": 0.8327801823616028, + "rewards/rejected": -3.3883907794952393, + "sft_loss": 2.7125325202941895, + "step": 2265 + }, + { + "epoch": 1.2149188827563138, + "grad_norm": 16.459688569958313, + "learning_rate": 7.424093505603087e-07, + "logits/chosen": -0.21486875414848328, + "logits/rejected": 0.0009250387665815651, + "logps/chosen": -2.632251501083374, + "logps/rejected": -3.5194313526153564, + "loss": 0.5191, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.632251501083374, + "rewards/margins": 0.8871792554855347, + "rewards/rejected": -3.5194313526153564, + "sft_loss": 2.7042503356933594, + "step": 2270 + }, + { + "epoch": 1.2175949155377153, + "grad_norm": 12.02172575643201, + "learning_rate": 7.410460496122482e-07, + "logits/chosen": -0.11882486194372177, + "logits/rejected": 0.061978042125701904, + "logps/chosen": -2.511664867401123, + "logps/rejected": -3.44714093208313, + "loss": 0.5004, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.511664867401123, + "rewards/margins": 0.9354757070541382, + "rewards/rejected": -3.44714093208313, + "sft_loss": 2.6550300121307373, + "step": 2275 + }, + { + "epoch": 1.220270948319117, + "grad_norm": 12.938920832651148, + "learning_rate": 7.396804100343572e-07, + "logits/chosen": -0.21035528182983398, + "logits/rejected": 0.024009039625525475, + "logps/chosen": -2.495543956756592, + "logps/rejected": -3.2615630626678467, + "loss": 0.5356, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.495543956756592, + "rewards/margins": 0.7660187482833862, + "rewards/rejected": -3.2615630626678467, + "sft_loss": 2.665358304977417, + "step": 2280 + }, + { + "epoch": 1.2229469811005185, + "grad_norm": 10.660246082448193, + "learning_rate": 7.383124450760768e-07, + "logits/chosen": -0.1773698329925537, + "logits/rejected": 0.060549378395080566, + "logps/chosen": -2.765693426132202, + "logps/rejected": -3.6653060913085938, + "loss": 0.5215, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.765693426132202, + "rewards/margins": 0.8996122479438782, + "rewards/rejected": -3.6653060913085938, + "sft_loss": 2.8947532176971436, + "step": 2285 + }, + { + "epoch": 1.22562301388192, + "grad_norm": 11.952228684288013, + "learning_rate": 7.369421680094091e-07, + "logits/chosen": -0.25170189142227173, + "logits/rejected": -0.06527785211801529, + "logps/chosen": -2.6405110359191895, + "logps/rejected": -3.569425106048584, + "loss": 0.5445, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.6405110359191895, + "rewards/margins": 0.9289140701293945, + "rewards/rejected": -3.569425106048584, + "sft_loss": 2.804739236831665, + "step": 2290 + }, + { + "epoch": 1.2282990466633217, + "grad_norm": 14.512637560124094, + "learning_rate": 7.355695921287881e-07, + "logits/chosen": -0.18567724525928497, + "logits/rejected": -0.07959018647670746, + "logps/chosen": -2.851104259490967, + "logps/rejected": -3.550295352935791, + "loss": 0.604, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.851104259490967, + "rewards/margins": 0.6991909742355347, + "rewards/rejected": -3.550295352935791, + "sft_loss": 3.084052324295044, + "step": 2295 + }, + { + "epoch": 1.2309750794447232, + "grad_norm": 17.14862593143908, + "learning_rate": 7.341947307509513e-07, + "logits/chosen": -0.14178717136383057, + "logits/rejected": 0.025927435606718063, + "logps/chosen": -2.7473275661468506, + "logps/rejected": -3.539985179901123, + "loss": 0.584, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.7473275661468506, + "rewards/margins": 0.7926574945449829, + "rewards/rejected": -3.539985179901123, + "sft_loss": 2.9406330585479736, + "step": 2300 + }, + { + "epoch": 1.233651112226125, + "grad_norm": 14.567551554858559, + "learning_rate": 7.328175972148094e-07, + "logits/chosen": -0.12181315571069717, + "logits/rejected": 0.027345454320311546, + "logps/chosen": -2.9586310386657715, + "logps/rejected": -3.7822742462158203, + "loss": 0.5585, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.9586310386657715, + "rewards/margins": 0.8236430883407593, + "rewards/rejected": -3.7822742462158203, + "sft_loss": 3.0613818168640137, + "step": 2305 + }, + { + "epoch": 1.2363271450075264, + "grad_norm": 14.46195645390294, + "learning_rate": 7.314382048813185e-07, + "logits/chosen": -0.07838133722543716, + "logits/rejected": 0.24125882983207703, + "logps/chosen": -2.6525206565856934, + "logps/rejected": -3.5842769145965576, + "loss": 0.5071, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -2.6525206565856934, + "rewards/margins": 0.9317564964294434, + "rewards/rejected": -3.5842769145965576, + "sft_loss": 2.7743771076202393, + "step": 2310 + }, + { + "epoch": 1.2390031777889279, + "grad_norm": 12.667723186738346, + "learning_rate": 7.300565671333486e-07, + "logits/chosen": -0.16325172781944275, + "logits/rejected": 0.09019894897937775, + "logps/chosen": -2.7748184204101562, + "logps/rejected": -3.6521975994110107, + "loss": 0.5157, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.7748184204101562, + "rewards/margins": 0.8773792386054993, + "rewards/rejected": -3.6521975994110107, + "sft_loss": 2.919607400894165, + "step": 2315 + }, + { + "epoch": 1.2416792105703296, + "grad_norm": 9.821970272156106, + "learning_rate": 7.286726973755554e-07, + "logits/chosen": -0.0405716709792614, + "logits/rejected": 0.012422848492860794, + "logps/chosen": -2.7273387908935547, + "logps/rejected": -3.563554048538208, + "loss": 0.5221, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.7273387908935547, + "rewards/margins": 0.8362150192260742, + "rewards/rejected": -3.563554048538208, + "sft_loss": 2.8721890449523926, + "step": 2320 + }, + { + "epoch": 1.244355243351731, + "grad_norm": 11.2576869894753, + "learning_rate": 7.272866090342493e-07, + "logits/chosen": 0.014239540323615074, + "logits/rejected": 0.10396864265203476, + "logps/chosen": -2.7915005683898926, + "logps/rejected": -3.6830050945281982, + "loss": 0.5032, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.7915005683898926, + "rewards/margins": 0.8915045857429504, + "rewards/rejected": -3.6830050945281982, + "sft_loss": 2.8475332260131836, + "step": 2325 + }, + { + "epoch": 1.2470312761331326, + "grad_norm": 14.886226508028688, + "learning_rate": 7.258983155572656e-07, + "logits/chosen": -0.1416918933391571, + "logits/rejected": -0.013908380642533302, + "logps/chosen": -2.7864866256713867, + "logps/rejected": -3.65608549118042, + "loss": 0.5579, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.7864866256713867, + "rewards/margins": 0.8695987462997437, + "rewards/rejected": -3.65608549118042, + "sft_loss": 3.0220208168029785, + "step": 2330 + }, + { + "epoch": 1.2497073089145343, + "grad_norm": 11.78564603000481, + "learning_rate": 7.245078304138335e-07, + "logits/chosen": -0.04754914715886116, + "logits/rejected": 0.0625465139746666, + "logps/chosen": -2.9272003173828125, + "logps/rejected": -3.7455573081970215, + "loss": 0.5553, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.9272003173828125, + "rewards/margins": 0.8183562159538269, + "rewards/rejected": -3.7455573081970215, + "sft_loss": 3.047041416168213, + "step": 2335 + }, + { + "epoch": 1.2523833416959358, + "grad_norm": 11.619307815290895, + "learning_rate": 7.231151670944462e-07, + "logits/chosen": -0.22953256964683533, + "logits/rejected": 0.018267208710312843, + "logps/chosen": -2.9490160942077637, + "logps/rejected": -3.734051465988159, + "loss": 0.5658, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.9490160942077637, + "rewards/margins": 0.7850354313850403, + "rewards/rejected": -3.734051465988159, + "sft_loss": 3.06479811668396, + "step": 2340 + }, + { + "epoch": 1.2550593744773373, + "grad_norm": 11.88694919613225, + "learning_rate": 7.217203391107291e-07, + "logits/chosen": -0.15705767273902893, + "logits/rejected": 0.05266653373837471, + "logps/chosen": -2.8118395805358887, + "logps/rejected": -3.7154057025909424, + "loss": 0.5396, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.8118395805358887, + "rewards/margins": 0.9035660028457642, + "rewards/rejected": -3.7154057025909424, + "sft_loss": 2.9647486209869385, + "step": 2345 + }, + { + "epoch": 1.257735407258739, + "grad_norm": 11.194729124579853, + "learning_rate": 7.203233599953096e-07, + "logits/chosen": -0.11801191419363022, + "logits/rejected": 0.08426766842603683, + "logps/chosen": -2.867255449295044, + "logps/rejected": -3.711289882659912, + "loss": 0.5432, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.867255449295044, + "rewards/margins": 0.8440347909927368, + "rewards/rejected": -3.711289882659912, + "sft_loss": 2.996035099029541, + "step": 2350 + }, + { + "epoch": 1.2604114400401405, + "grad_norm": 15.396629241648583, + "learning_rate": 7.189242433016852e-07, + "logits/chosen": -0.07606076449155807, + "logits/rejected": 0.08415577560663223, + "logps/chosen": -2.6616203784942627, + "logps/rejected": -3.712498188018799, + "loss": 0.5173, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.6616203784942627, + "rewards/margins": 1.0508776903152466, + "rewards/rejected": -3.712498188018799, + "sft_loss": 2.822685480117798, + "step": 2355 + }, + { + "epoch": 1.263087472821542, + "grad_norm": 15.779769905906738, + "learning_rate": 7.17523002604092e-07, + "logits/chosen": -0.1172393336892128, + "logits/rejected": 0.0867505595088005, + "logps/chosen": -2.902477979660034, + "logps/rejected": -3.823699474334717, + "loss": 0.5439, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.902477979660034, + "rewards/margins": 0.9212223291397095, + "rewards/rejected": -3.823699474334717, + "sft_loss": 3.1079139709472656, + "step": 2360 + }, + { + "epoch": 1.2657635056029437, + "grad_norm": 14.826732984520188, + "learning_rate": 7.161196514973734e-07, + "logits/chosen": -0.0951496809720993, + "logits/rejected": 0.10926233232021332, + "logps/chosen": -2.767425775527954, + "logps/rejected": -3.7156193256378174, + "loss": 0.5329, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.767425775527954, + "rewards/margins": 0.9481937289237976, + "rewards/rejected": -3.7156193256378174, + "sft_loss": 2.93123197555542, + "step": 2365 + }, + { + "epoch": 1.2684395383843452, + "grad_norm": 15.424680439322838, + "learning_rate": 7.147142035968483e-07, + "logits/chosen": -0.05707361549139023, + "logits/rejected": 0.14730793237686157, + "logps/chosen": -2.9719011783599854, + "logps/rejected": -3.8209996223449707, + "loss": 0.5497, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.9719011783599854, + "rewards/margins": 0.8490983247756958, + "rewards/rejected": -3.8209996223449707, + "sft_loss": 3.1319332122802734, + "step": 2370 + }, + { + "epoch": 1.2711155711657467, + "grad_norm": 16.015488945324908, + "learning_rate": 7.133066725381781e-07, + "logits/chosen": -0.19962266087532043, + "logits/rejected": 0.032320786267519, + "logps/chosen": -2.7625584602355957, + "logps/rejected": -3.652691602706909, + "loss": 0.5326, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.7625584602355957, + "rewards/margins": 0.8901335597038269, + "rewards/rejected": -3.652691602706909, + "sft_loss": 2.890148639678955, + "step": 2375 + }, + { + "epoch": 1.2737916039471484, + "grad_norm": 13.416203673025201, + "learning_rate": 7.118970719772354e-07, + "logits/chosen": -0.1521248072385788, + "logits/rejected": 0.05423276498913765, + "logps/chosen": -2.9326486587524414, + "logps/rejected": -3.9643986225128174, + "loss": 0.5213, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.9326486587524414, + "rewards/margins": 1.0317497253417969, + "rewards/rejected": -3.9643986225128174, + "sft_loss": 3.080808162689209, + "step": 2380 + }, + { + "epoch": 1.27646763672855, + "grad_norm": 16.355261452513187, + "learning_rate": 7.104854155899711e-07, + "logits/chosen": -0.056324996054172516, + "logits/rejected": 0.09308980405330658, + "logps/chosen": -2.9391448497772217, + "logps/rejected": -3.81658673286438, + "loss": 0.5471, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.9391448497772217, + "rewards/margins": 0.8774418830871582, + "rewards/rejected": -3.81658673286438, + "sft_loss": 3.0427327156066895, + "step": 2385 + }, + { + "epoch": 1.2791436695099514, + "grad_norm": 15.660593397945485, + "learning_rate": 7.090717170722817e-07, + "logits/chosen": -0.07351523637771606, + "logits/rejected": 0.0036463707219809294, + "logps/chosen": -2.8803794384002686, + "logps/rejected": -3.9230926036834717, + "loss": 0.4815, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.8803794384002686, + "rewards/margins": 1.0427131652832031, + "rewards/rejected": -3.9230926036834717, + "sft_loss": 3.0287270545959473, + "step": 2390 + }, + { + "epoch": 1.2818197022913531, + "grad_norm": 13.5979207659121, + "learning_rate": 7.076559901398762e-07, + "logits/chosen": -0.2557370960712433, + "logits/rejected": -0.08898217976093292, + "logps/chosen": -2.7281875610351562, + "logps/rejected": -3.4912161827087402, + "loss": 0.5508, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.7281875610351562, + "rewards/margins": 0.7630285024642944, + "rewards/rejected": -3.4912161827087402, + "sft_loss": 2.8979766368865967, + "step": 2395 + }, + { + "epoch": 1.2844957350727546, + "grad_norm": 21.81683023409898, + "learning_rate": 7.062382485281436e-07, + "logits/chosen": -0.17235462367534637, + "logits/rejected": -0.011511986143887043, + "logps/chosen": -2.8245368003845215, + "logps/rejected": -3.442866086959839, + "loss": 0.5895, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.8245368003845215, + "rewards/margins": 0.6183292269706726, + "rewards/rejected": -3.442866086959839, + "sft_loss": 2.9664804935455322, + "step": 2400 + }, + { + "epoch": 1.2844957350727546, + "eval_logits/chosen": 0.21740129590034485, + "eval_logits/rejected": 0.3361586034297943, + "eval_logps/chosen": -2.896199941635132, + "eval_logps/rejected": -3.7486469745635986, + "eval_loss": 0.5589631199836731, + "eval_rewards/accuracies": 0.7321958541870117, + "eval_rewards/chosen": -2.896199941635132, + "eval_rewards/margins": 0.8524471521377563, + "eval_rewards/rejected": -3.7486469745635986, + "eval_runtime": 49.6629, + "eval_samples_per_second": 27.083, + "eval_sft_loss": 3.063127279281616, + "eval_steps_per_second": 6.786, + "step": 2400 + }, + { + "epoch": 1.287171767854156, + "grad_norm": 10.169197101778261, + "learning_rate": 7.048185059920193e-07, + "logits/chosen": -0.13088330626487732, + "logits/rejected": 0.033510565757751465, + "logps/chosen": -2.812943935394287, + "logps/rejected": -3.8146815299987793, + "loss": 0.5221, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.812943935394287, + "rewards/margins": 1.0017378330230713, + "rewards/rejected": -3.8146815299987793, + "sft_loss": 2.9715335369110107, + "step": 2405 + }, + { + "epoch": 1.2898478006355578, + "grad_norm": 15.283932968431893, + "learning_rate": 7.033967763058516e-07, + "logits/chosen": -0.21596041321754456, + "logits/rejected": 0.0054006329737603664, + "logps/chosen": -2.8387198448181152, + "logps/rejected": -3.5503010749816895, + "loss": 0.5535, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.8387198448181152, + "rewards/margins": 0.7115810513496399, + "rewards/rejected": -3.5503010749816895, + "sft_loss": 2.988356113433838, + "step": 2410 + }, + { + "epoch": 1.2925238334169593, + "grad_norm": 12.239190981465617, + "learning_rate": 7.019730732632681e-07, + "logits/chosen": -0.06232795864343643, + "logits/rejected": 0.05048118159174919, + "logps/chosen": -2.8041563034057617, + "logps/rejected": -3.745462417602539, + "loss": 0.5402, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.8041563034057617, + "rewards/margins": 0.941305935382843, + "rewards/rejected": -3.745462417602539, + "sft_loss": 2.9846417903900146, + "step": 2415 + }, + { + "epoch": 1.2951998661983608, + "grad_norm": 10.981673187841457, + "learning_rate": 7.005474106770418e-07, + "logits/chosen": -0.18740686774253845, + "logits/rejected": -0.04007058963179588, + "logps/chosen": -2.7511918544769287, + "logps/rejected": -3.655221939086914, + "loss": 0.5035, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.7511918544769287, + "rewards/margins": 0.9040305018424988, + "rewards/rejected": -3.655221939086914, + "sft_loss": 2.9835357666015625, + "step": 2420 + }, + { + "epoch": 1.2978758989797625, + "grad_norm": 11.537702916654943, + "learning_rate": 6.991198023789577e-07, + "logits/chosen": -0.10398707538843155, + "logits/rejected": 0.002530190395191312, + "logps/chosen": -2.594005584716797, + "logps/rejected": -3.307220935821533, + "loss": 0.5364, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.594005584716797, + "rewards/margins": 0.7132154703140259, + "rewards/rejected": -3.307220935821533, + "sft_loss": 2.844587802886963, + "step": 2425 + }, + { + "epoch": 1.300551931761164, + "grad_norm": 14.650164088227385, + "learning_rate": 6.976902622196776e-07, + "logits/chosen": -0.13141386210918427, + "logits/rejected": -0.04375090077519417, + "logps/chosen": -2.753594160079956, + "logps/rejected": -3.594632625579834, + "loss": 0.5434, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.753594160079956, + "rewards/margins": 0.8410388231277466, + "rewards/rejected": -3.594632625579834, + "sft_loss": 2.9324729442596436, + "step": 2430 + }, + { + "epoch": 1.3032279645425655, + "grad_norm": 12.054791406137, + "learning_rate": 6.962588040686064e-07, + "logits/chosen": -0.13357993960380554, + "logits/rejected": 0.04774869233369827, + "logps/chosen": -2.727971315383911, + "logps/rejected": -3.395655870437622, + "loss": 0.6061, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -2.727971315383911, + "rewards/margins": 0.6676840782165527, + "rewards/rejected": -3.395655870437622, + "sft_loss": 2.8951988220214844, + "step": 2435 + }, + { + "epoch": 1.3059039973239672, + "grad_norm": 13.956191863559187, + "learning_rate": 6.948254418137573e-07, + "logits/chosen": -0.152082160115242, + "logits/rejected": 0.005078119225800037, + "logps/chosen": -2.735032558441162, + "logps/rejected": -3.594900131225586, + "loss": 0.5572, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.735032558441162, + "rewards/margins": 0.8598672747612, + "rewards/rejected": -3.594900131225586, + "sft_loss": 2.8514065742492676, + "step": 2440 + }, + { + "epoch": 1.3085800301053687, + "grad_norm": 17.880626952480096, + "learning_rate": 6.933901893616174e-07, + "logits/chosen": -0.18042948842048645, + "logits/rejected": 0.001165248453617096, + "logps/chosen": -2.8338892459869385, + "logps/rejected": -3.564720630645752, + "loss": 0.5791, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.8338892459869385, + "rewards/margins": 0.7308312654495239, + "rewards/rejected": -3.564720630645752, + "sft_loss": 2.9869532585144043, + "step": 2445 + }, + { + "epoch": 1.3112560628867704, + "grad_norm": 17.936263693494023, + "learning_rate": 6.919530606370121e-07, + "logits/chosen": -0.11919529736042023, + "logits/rejected": 0.06414364278316498, + "logps/chosen": -2.6797900199890137, + "logps/rejected": -3.5703368186950684, + "loss": 0.5222, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.6797900199890137, + "rewards/margins": 0.8905467987060547, + "rewards/rejected": -3.5703368186950684, + "sft_loss": 2.842641592025757, + "step": 2450 + }, + { + "epoch": 1.313932095668172, + "grad_norm": 11.778029290115247, + "learning_rate": 6.905140695829706e-07, + "logits/chosen": -0.2211964875459671, + "logits/rejected": 0.10638616979122162, + "logps/chosen": -2.8779773712158203, + "logps/rejected": -3.729775905609131, + "loss": 0.5211, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.8779773712158203, + "rewards/margins": 0.8517991304397583, + "rewards/rejected": -3.729775905609131, + "sft_loss": 2.9841933250427246, + "step": 2455 + }, + { + "epoch": 1.3166081284495736, + "grad_norm": 21.232965650496542, + "learning_rate": 6.890732301605904e-07, + "logits/chosen": -0.1100403293967247, + "logits/rejected": 0.023740727454423904, + "logps/chosen": -2.82916522026062, + "logps/rejected": -3.5652172565460205, + "loss": 0.5753, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.82916522026062, + "rewards/margins": 0.7360517382621765, + "rewards/rejected": -3.5652172565460205, + "sft_loss": 2.94977068901062, + "step": 2460 + }, + { + "epoch": 1.3192841612309751, + "grad_norm": 13.816982282247295, + "learning_rate": 6.876305563489021e-07, + "logits/chosen": -0.13921356201171875, + "logits/rejected": -0.0037501081824302673, + "logps/chosen": -3.0031192302703857, + "logps/rejected": -4.045921802520752, + "loss": 0.4954, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.0031192302703857, + "rewards/margins": 1.0428025722503662, + "rewards/rejected": -4.045921802520752, + "sft_loss": 3.080378770828247, + "step": 2465 + }, + { + "epoch": 1.3219601940123766, + "grad_norm": 17.284787525126063, + "learning_rate": 6.861860621447331e-07, + "logits/chosen": -0.24131083488464355, + "logits/rejected": -0.09177212417125702, + "logps/chosen": -3.058800220489502, + "logps/rejected": -3.75386118888855, + "loss": 0.5922, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.058800220489502, + "rewards/margins": 0.6950610280036926, + "rewards/rejected": -3.75386118888855, + "sft_loss": 3.283332347869873, + "step": 2470 + }, + { + "epoch": 1.3246362267937783, + "grad_norm": 16.4037189947279, + "learning_rate": 6.847397615625725e-07, + "logits/chosen": -0.05851219221949577, + "logits/rejected": 0.018921365961432457, + "logps/chosen": -3.0433387756347656, + "logps/rejected": -3.802337646484375, + "loss": 0.563, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -3.0433387756347656, + "rewards/margins": 0.7589989900588989, + "rewards/rejected": -3.802337646484375, + "sft_loss": 3.2078189849853516, + "step": 2475 + }, + { + "epoch": 1.3273122595751798, + "grad_norm": 12.783966343392903, + "learning_rate": 6.83291668634435e-07, + "logits/chosen": -0.23455576598644257, + "logits/rejected": 0.007899327203631401, + "logps/chosen": -2.9427683353424072, + "logps/rejected": -3.9789185523986816, + "loss": 0.5023, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.9427683353424072, + "rewards/margins": 1.0361502170562744, + "rewards/rejected": -3.9789185523986816, + "sft_loss": 3.2492759227752686, + "step": 2480 + }, + { + "epoch": 1.3299882923565813, + "grad_norm": 14.533663119876245, + "learning_rate": 6.818417974097246e-07, + "logits/chosen": 0.0017811127472668886, + "logits/rejected": 0.1938328891992569, + "logps/chosen": -2.8244447708129883, + "logps/rejected": -3.9046339988708496, + "loss": 0.5096, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.8244447708129883, + "rewards/margins": 1.0801894664764404, + "rewards/rejected": -3.9046339988708496, + "sft_loss": 3.1318705081939697, + "step": 2485 + }, + { + "epoch": 1.332664325137983, + "grad_norm": 13.898077604356237, + "learning_rate": 6.803901619550981e-07, + "logits/chosen": -0.16150932013988495, + "logits/rejected": -0.07232292741537094, + "logps/chosen": -2.845247983932495, + "logps/rejected": -3.7361931800842285, + "loss": 0.5081, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.845247983932495, + "rewards/margins": 0.89094477891922, + "rewards/rejected": -3.7361931800842285, + "sft_loss": 3.053283214569092, + "step": 2490 + }, + { + "epoch": 1.3353403579193845, + "grad_norm": 15.257198775978162, + "learning_rate": 6.789367763543292e-07, + "logits/chosen": -0.09079436212778091, + "logits/rejected": -0.049105338752269745, + "logps/chosen": -2.799342632293701, + "logps/rejected": -3.526744842529297, + "loss": 0.5961, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.799342632293701, + "rewards/margins": 0.7274022102355957, + "rewards/rejected": -3.526744842529297, + "sft_loss": 3.0104057788848877, + "step": 2495 + }, + { + "epoch": 1.338016390700786, + "grad_norm": 12.661016212031267, + "learning_rate": 6.774816547081714e-07, + "logits/chosen": -0.08757440000772476, + "logits/rejected": 0.11619249731302261, + "logps/chosen": -2.7563064098358154, + "logps/rejected": -3.429248809814453, + "loss": 0.56, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.7563064098358154, + "rewards/margins": 0.6729423999786377, + "rewards/rejected": -3.429248809814453, + "sft_loss": 2.9776875972747803, + "step": 2500 + }, + { + "epoch": 1.3406924234821878, + "grad_norm": 11.3369202045845, + "learning_rate": 6.760248111342211e-07, + "logits/chosen": -0.11019454151391983, + "logits/rejected": 0.09622526913881302, + "logps/chosen": -2.58648419380188, + "logps/rejected": -3.4114139080047607, + "loss": 0.5313, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.58648419380188, + "rewards/margins": 0.8249297142028809, + "rewards/rejected": -3.4114139080047607, + "sft_loss": 2.7188868522644043, + "step": 2505 + }, + { + "epoch": 1.3433684562635893, + "grad_norm": 11.930669828343557, + "learning_rate": 6.745662597667813e-07, + "logits/chosen": -0.18405844271183014, + "logits/rejected": -0.00788338202983141, + "logps/chosen": -2.5469136238098145, + "logps/rejected": -3.428063154220581, + "loss": 0.504, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.5469136238098145, + "rewards/margins": 0.88114994764328, + "rewards/rejected": -3.428063154220581, + "sft_loss": 2.7372565269470215, + "step": 2510 + }, + { + "epoch": 1.3460444890449907, + "grad_norm": 13.572330915296204, + "learning_rate": 6.731060147567236e-07, + "logits/chosen": -0.07396905869245529, + "logits/rejected": 0.04314111918210983, + "logps/chosen": -2.5699820518493652, + "logps/rejected": -3.4824867248535156, + "loss": 0.5057, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.5699820518493652, + "rewards/margins": 0.9125045537948608, + "rewards/rejected": -3.4824867248535156, + "sft_loss": 2.786505699157715, + "step": 2515 + }, + { + "epoch": 1.3487205218263925, + "grad_norm": 13.632209758213088, + "learning_rate": 6.716440902713515e-07, + "logits/chosen": -0.1771983802318573, + "logits/rejected": -0.0806611105799675, + "logps/chosen": -2.7183549404144287, + "logps/rejected": -3.488394260406494, + "loss": 0.5214, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.7183549404144287, + "rewards/margins": 0.7700392007827759, + "rewards/rejected": -3.488394260406494, + "sft_loss": 2.750516414642334, + "step": 2520 + }, + { + "epoch": 1.351396554607794, + "grad_norm": 16.99291377369377, + "learning_rate": 6.701805004942627e-07, + "logits/chosen": -0.16392597556114197, + "logits/rejected": -0.068141408264637, + "logps/chosen": -2.8260693550109863, + "logps/rejected": -3.63167142868042, + "loss": 0.5541, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.8260693550109863, + "rewards/margins": 0.8056022524833679, + "rewards/rejected": -3.63167142868042, + "sft_loss": 3.0697553157806396, + "step": 2525 + }, + { + "epoch": 1.3540725873891954, + "grad_norm": 17.881106078580785, + "learning_rate": 6.687152596252119e-07, + "logits/chosen": -0.13811075687408447, + "logits/rejected": -0.05306004732847214, + "logps/chosen": -2.8596765995025635, + "logps/rejected": -3.5510857105255127, + "loss": 0.6078, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.8596765995025635, + "rewards/margins": 0.6914095878601074, + "rewards/rejected": -3.5510857105255127, + "sft_loss": 3.048046112060547, + "step": 2530 + }, + { + "epoch": 1.3567486201705972, + "grad_norm": 14.497796493869934, + "learning_rate": 6.672483818799722e-07, + "logits/chosen": -0.22085972130298615, + "logits/rejected": -0.03556264936923981, + "logps/chosen": -2.8032827377319336, + "logps/rejected": -3.651709794998169, + "loss": 0.5368, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.8032827377319336, + "rewards/margins": 0.848427414894104, + "rewards/rejected": -3.651709794998169, + "sft_loss": 2.9784340858459473, + "step": 2535 + }, + { + "epoch": 1.3594246529519987, + "grad_norm": 16.23050143054709, + "learning_rate": 6.657798814901978e-07, + "logits/chosen": -0.18565736711025238, + "logits/rejected": 0.02686428092420101, + "logps/chosen": -2.8961615562438965, + "logps/rejected": -3.6319243907928467, + "loss": 0.5747, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.8961615562438965, + "rewards/margins": 0.7357627749443054, + "rewards/rejected": -3.6319243907928467, + "sft_loss": 3.0940470695495605, + "step": 2540 + }, + { + "epoch": 1.3621006857334002, + "grad_norm": 14.795515293991508, + "learning_rate": 6.643097727032863e-07, + "logits/chosen": -0.20490925014019012, + "logits/rejected": 0.03412886708974838, + "logps/chosen": -2.802849292755127, + "logps/rejected": -3.7249903678894043, + "loss": 0.5106, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.802849292755127, + "rewards/margins": 0.9221410751342773, + "rewards/rejected": -3.7249903678894043, + "sft_loss": 2.9522287845611572, + "step": 2545 + }, + { + "epoch": 1.3647767185148019, + "grad_norm": 13.311934939525496, + "learning_rate": 6.628380697822392e-07, + "logits/chosen": -0.16037167608737946, + "logits/rejected": 0.039409227669239044, + "logps/chosen": -2.8980062007904053, + "logps/rejected": -3.645191192626953, + "loss": 0.5538, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.8980062007904053, + "rewards/margins": 0.7471850514411926, + "rewards/rejected": -3.645191192626953, + "sft_loss": 3.0559191703796387, + "step": 2550 + }, + { + "epoch": 1.3674527512962034, + "grad_norm": 19.606325206974198, + "learning_rate": 6.61364787005525e-07, + "logits/chosen": -0.12282273918390274, + "logits/rejected": 0.030886346474289894, + "logps/chosen": -2.7401130199432373, + "logps/rejected": -3.7287514209747314, + "loss": 0.5452, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.7401130199432373, + "rewards/margins": 0.988638699054718, + "rewards/rejected": -3.7287514209747314, + "sft_loss": 2.9616169929504395, + "step": 2555 + }, + { + "epoch": 1.3701287840776049, + "grad_norm": 18.41661304276396, + "learning_rate": 6.598899386669395e-07, + "logits/chosen": -0.11560845375061035, + "logits/rejected": 0.032873429358005524, + "logps/chosen": -2.8558077812194824, + "logps/rejected": -3.606076717376709, + "loss": 0.5747, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.8558077812194824, + "rewards/margins": 0.7502694129943848, + "rewards/rejected": -3.606076717376709, + "sft_loss": 2.979921817779541, + "step": 2560 + }, + { + "epoch": 1.3728048168590066, + "grad_norm": 20.287732924933724, + "learning_rate": 6.584135390754679e-07, + "logits/chosen": -0.14809802174568176, + "logits/rejected": 0.026063639670610428, + "logps/chosen": -2.7513139247894287, + "logps/rejected": -3.6620700359344482, + "loss": 0.5366, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.7513139247894287, + "rewards/margins": 0.9107562899589539, + "rewards/rejected": -3.6620700359344482, + "sft_loss": 2.9361300468444824, + "step": 2565 + }, + { + "epoch": 1.375480849640408, + "grad_norm": 9.657853877904172, + "learning_rate": 6.569356025551454e-07, + "logits/chosen": -0.10108263790607452, + "logits/rejected": -0.0015401586424559355, + "logps/chosen": -2.671628952026367, + "logps/rejected": -3.570929765701294, + "loss": 0.5249, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.671628952026367, + "rewards/margins": 0.8993002772331238, + "rewards/rejected": -3.570929765701294, + "sft_loss": 2.78222918510437, + "step": 2570 + }, + { + "epoch": 1.3781568824218096, + "grad_norm": 12.249073564214969, + "learning_rate": 6.554561434449186e-07, + "logits/chosen": -0.2132205218076706, + "logits/rejected": -0.01806102879345417, + "logps/chosen": -2.6709792613983154, + "logps/rejected": -3.5337345600128174, + "loss": 0.5402, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.6709792613983154, + "rewards/margins": 0.8627556562423706, + "rewards/rejected": -3.5337345600128174, + "sft_loss": 2.8137805461883545, + "step": 2575 + }, + { + "epoch": 1.3808329152032113, + "grad_norm": 18.244963709261334, + "learning_rate": 6.539751760985063e-07, + "logits/chosen": -0.15387776494026184, + "logits/rejected": -0.03826805576682091, + "logps/chosen": -2.8103771209716797, + "logps/rejected": -3.4488766193389893, + "loss": 0.5825, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.8103771209716797, + "rewards/margins": 0.6384997963905334, + "rewards/rejected": -3.4488766193389893, + "sft_loss": 2.9886860847473145, + "step": 2580 + }, + { + "epoch": 1.3835089479846128, + "grad_norm": 11.372926174115369, + "learning_rate": 6.524927148842602e-07, + "logits/chosen": -0.051814544945955276, + "logits/rejected": 0.14694175124168396, + "logps/chosen": -2.651075839996338, + "logps/rejected": -3.5731492042541504, + "loss": 0.505, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.651075839996338, + "rewards/margins": 0.9220731854438782, + "rewards/rejected": -3.5731492042541504, + "sft_loss": 2.774786949157715, + "step": 2585 + }, + { + "epoch": 1.3861849807660143, + "grad_norm": 16.02538304897232, + "learning_rate": 6.510087741850254e-07, + "logits/chosen": -0.15899457037448883, + "logits/rejected": -0.0023765608202666044, + "logps/chosen": -2.590641498565674, + "logps/rejected": -3.3977553844451904, + "loss": 0.5546, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.590641498565674, + "rewards/margins": 0.8071142435073853, + "rewards/rejected": -3.3977553844451904, + "sft_loss": 2.838257074356079, + "step": 2590 + }, + { + "epoch": 1.388861013547416, + "grad_norm": 13.562954231672789, + "learning_rate": 6.495233683980012e-07, + "logits/chosen": -0.13240444660186768, + "logits/rejected": -0.061270572245121, + "logps/chosen": -2.760969638824463, + "logps/rejected": -3.5130043029785156, + "loss": 0.5545, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.760969638824463, + "rewards/margins": 0.7520343661308289, + "rewards/rejected": -3.5130043029785156, + "sft_loss": 2.855792999267578, + "step": 2595 + }, + { + "epoch": 1.3915370463288175, + "grad_norm": 15.00286944204792, + "learning_rate": 6.480365119346011e-07, + "logits/chosen": -0.03778408467769623, + "logits/rejected": 0.13565590977668762, + "logps/chosen": -2.7521817684173584, + "logps/rejected": -3.5368714332580566, + "loss": 0.5358, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.7521817684173584, + "rewards/margins": 0.7846895456314087, + "rewards/rejected": -3.5368714332580566, + "sft_loss": 2.9168272018432617, + "step": 2600 + }, + { + "epoch": 1.394213079110219, + "grad_norm": 13.34637580930508, + "learning_rate": 6.465482192203129e-07, + "logits/chosen": -0.018246622756123543, + "logits/rejected": 0.07032772898674011, + "logps/chosen": -2.7906553745269775, + "logps/rejected": -3.6011791229248047, + "loss": 0.5307, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.7906553745269775, + "rewards/margins": 0.8105236887931824, + "rewards/rejected": -3.6011791229248047, + "sft_loss": 3.0532116889953613, + "step": 2605 + }, + { + "epoch": 1.3968891118916207, + "grad_norm": 24.566874069177967, + "learning_rate": 6.45058504694559e-07, + "logits/chosen": -0.021194588392972946, + "logits/rejected": 0.04253927990794182, + "logps/chosen": -2.8475394248962402, + "logps/rejected": -3.7430832386016846, + "loss": 0.5366, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.8475394248962402, + "rewards/margins": 0.8955437541007996, + "rewards/rejected": -3.7430832386016846, + "sft_loss": 3.0145957469940186, + "step": 2610 + }, + { + "epoch": 1.3995651446730222, + "grad_norm": 22.213587799390993, + "learning_rate": 6.435673828105564e-07, + "logits/chosen": -0.13955837488174438, + "logits/rejected": 0.028280243277549744, + "logps/chosen": -2.7947468757629395, + "logps/rejected": -3.759547710418701, + "loss": 0.5256, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.7947468757629395, + "rewards/margins": 0.9648011922836304, + "rewards/rejected": -3.759547710418701, + "sft_loss": 3.0343430042266846, + "step": 2615 + }, + { + "epoch": 1.402241177454424, + "grad_norm": 16.48086056628069, + "learning_rate": 6.420748680351763e-07, + "logits/chosen": -0.05696401000022888, + "logits/rejected": -0.09101828187704086, + "logps/chosen": -2.867499828338623, + "logps/rejected": -3.5520408153533936, + "loss": 0.5869, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.867499828338623, + "rewards/margins": 0.6845409274101257, + "rewards/rejected": -3.5520408153533936, + "sft_loss": 3.1165659427642822, + "step": 2620 + }, + { + "epoch": 1.4049172102358254, + "grad_norm": 24.259711379457958, + "learning_rate": 6.405809748488032e-07, + "logits/chosen": -0.14697353541851044, + "logits/rejected": 0.04247549921274185, + "logps/chosen": -2.9035472869873047, + "logps/rejected": -3.8812873363494873, + "loss": 0.5561, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.9035472869873047, + "rewards/margins": 0.9777399301528931, + "rewards/rejected": -3.8812873363494873, + "sft_loss": 3.024491310119629, + "step": 2625 + }, + { + "epoch": 1.4075932430172269, + "grad_norm": 13.22720645000827, + "learning_rate": 6.390857177451956e-07, + "logits/chosen": -0.24623966217041016, + "logits/rejected": -0.02795444056391716, + "logps/chosen": -2.919384241104126, + "logps/rejected": -3.8018722534179688, + "loss": 0.5353, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.919384241104126, + "rewards/margins": 0.882487952709198, + "rewards/rejected": -3.8018722534179688, + "sft_loss": 3.0856080055236816, + "step": 2630 + }, + { + "epoch": 1.4102692757986286, + "grad_norm": 16.391043845520393, + "learning_rate": 6.375891112313445e-07, + "logits/chosen": -0.2169569432735443, + "logits/rejected": -0.0841716006398201, + "logps/chosen": -2.943756580352783, + "logps/rejected": -3.8317313194274902, + "loss": 0.5336, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.943756580352783, + "rewards/margins": 0.887974739074707, + "rewards/rejected": -3.8317313194274902, + "sft_loss": 3.124804973602295, + "step": 2635 + }, + { + "epoch": 1.41294530858003, + "grad_norm": 16.571457287546348, + "learning_rate": 6.360911698273326e-07, + "logits/chosen": -0.14374461770057678, + "logits/rejected": 0.007679411675781012, + "logps/chosen": -3.0234484672546387, + "logps/rejected": -3.7668967247009277, + "loss": 0.575, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.0234484672546387, + "rewards/margins": 0.7434485554695129, + "rewards/rejected": -3.7668967247009277, + "sft_loss": 3.1646761894226074, + "step": 2640 + }, + { + "epoch": 1.4156213413614318, + "grad_norm": 15.362718817664941, + "learning_rate": 6.345919080661944e-07, + "logits/chosen": -0.14833353459835052, + "logits/rejected": -0.04470009729266167, + "logps/chosen": -2.707223892211914, + "logps/rejected": -3.673488140106201, + "loss": 0.4991, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -2.707223892211914, + "rewards/margins": 0.9662643671035767, + "rewards/rejected": -3.673488140106201, + "sft_loss": 2.848567485809326, + "step": 2645 + }, + { + "epoch": 1.4182973741428333, + "grad_norm": 13.379887652672586, + "learning_rate": 6.330913404937737e-07, + "logits/chosen": -0.21039438247680664, + "logits/rejected": -0.033975306898355484, + "logps/chosen": -2.8401217460632324, + "logps/rejected": -3.8723747730255127, + "loss": 0.5236, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.8401217460632324, + "rewards/margins": 1.032252550125122, + "rewards/rejected": -3.8723747730255127, + "sft_loss": 2.9957895278930664, + "step": 2650 + }, + { + "epoch": 1.4209734069242348, + "grad_norm": 15.738869793942214, + "learning_rate": 6.315894816685838e-07, + "logits/chosen": -0.15092626214027405, + "logits/rejected": 0.03545082360506058, + "logps/chosen": -2.785090208053589, + "logps/rejected": -3.711087465286255, + "loss": 0.4916, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.785090208053589, + "rewards/margins": 0.9259971380233765, + "rewards/rejected": -3.711087465286255, + "sft_loss": 3.063774585723877, + "step": 2655 + }, + { + "epoch": 1.4236494397056365, + "grad_norm": 14.72542003850956, + "learning_rate": 6.300863461616657e-07, + "logits/chosen": -0.11617982387542725, + "logits/rejected": -0.014439836144447327, + "logps/chosen": -2.838334560394287, + "logps/rejected": -3.5807464122772217, + "loss": 0.6031, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.838334560394287, + "rewards/margins": 0.7424119710922241, + "rewards/rejected": -3.5807464122772217, + "sft_loss": 3.023712635040283, + "step": 2660 + }, + { + "epoch": 1.426325472487038, + "grad_norm": 12.299955789891118, + "learning_rate": 6.285819485564465e-07, + "logits/chosen": -0.2647419273853302, + "logits/rejected": -0.07578029483556747, + "logps/chosen": -2.828758716583252, + "logps/rejected": -3.6691207885742188, + "loss": 0.517, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.828758716583252, + "rewards/margins": 0.8403621912002563, + "rewards/rejected": -3.6691207885742188, + "sft_loss": 3.0283281803131104, + "step": 2665 + }, + { + "epoch": 1.4290015052684395, + "grad_norm": 14.38249715299272, + "learning_rate": 6.270763034485986e-07, + "logits/chosen": -0.07366399466991425, + "logits/rejected": 0.05366234853863716, + "logps/chosen": -2.936784267425537, + "logps/rejected": -3.766112804412842, + "loss": 0.5265, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -2.936784267425537, + "rewards/margins": 0.8293284177780151, + "rewards/rejected": -3.766112804412842, + "sft_loss": 3.0304007530212402, + "step": 2670 + }, + { + "epoch": 1.4316775380498412, + "grad_norm": 21.464765721123907, + "learning_rate": 6.255694254458972e-07, + "logits/chosen": -0.13733455538749695, + "logits/rejected": 0.061878375709056854, + "logps/chosen": -2.890540838241577, + "logps/rejected": -3.669931411743164, + "loss": 0.5913, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.890540838241577, + "rewards/margins": 0.7793907523155212, + "rewards/rejected": -3.669931411743164, + "sft_loss": 2.913121461868286, + "step": 2675 + }, + { + "epoch": 1.4343535708312427, + "grad_norm": 22.08383332497907, + "learning_rate": 6.240613291680795e-07, + "logits/chosen": -0.20060577988624573, + "logits/rejected": 0.010845445096492767, + "logps/chosen": -2.7031121253967285, + "logps/rejected": -3.4497039318084717, + "loss": 0.5899, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.7031121253967285, + "rewards/margins": 0.7465916872024536, + "rewards/rejected": -3.4497039318084717, + "sft_loss": 2.8224806785583496, + "step": 2680 + }, + { + "epoch": 1.4370296036126442, + "grad_norm": 11.542264244056486, + "learning_rate": 6.225520292467021e-07, + "logits/chosen": -0.2530103027820587, + "logits/rejected": 0.029706627130508423, + "logps/chosen": -2.5867018699645996, + "logps/rejected": -3.494511842727661, + "loss": 0.4921, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.5867018699645996, + "rewards/margins": 0.9078100323677063, + "rewards/rejected": -3.494511842727661, + "sft_loss": 2.7249536514282227, + "step": 2685 + }, + { + "epoch": 1.439705636394046, + "grad_norm": 24.402269527036708, + "learning_rate": 6.210415403249993e-07, + "logits/chosen": -0.34314805269241333, + "logits/rejected": -0.03575535863637924, + "logps/chosen": -2.68799090385437, + "logps/rejected": -3.5274910926818848, + "loss": 0.5629, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.68799090385437, + "rewards/margins": 0.8395000696182251, + "rewards/rejected": -3.5274910926818848, + "sft_loss": 2.7762532234191895, + "step": 2690 + }, + { + "epoch": 1.4423816691754474, + "grad_norm": 16.204604712741396, + "learning_rate": 6.195298770577415e-07, + "logits/chosen": -0.08360803127288818, + "logits/rejected": -0.07277211546897888, + "logps/chosen": -2.678548812866211, + "logps/rejected": -3.5122742652893066, + "loss": 0.5523, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.678548812866211, + "rewards/margins": 0.83372563123703, + "rewards/rejected": -3.5122742652893066, + "sft_loss": 2.7697131633758545, + "step": 2695 + }, + { + "epoch": 1.445057701956849, + "grad_norm": 10.0804571859839, + "learning_rate": 6.180170541110923e-07, + "logits/chosen": -0.16857047379016876, + "logits/rejected": 0.06029961258172989, + "logps/chosen": -2.7331137657165527, + "logps/rejected": -3.5940475463867188, + "loss": 0.5378, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.7331137657165527, + "rewards/margins": 0.860933780670166, + "rewards/rejected": -3.5940475463867188, + "sft_loss": 2.9450149536132812, + "step": 2700 + }, + { + "epoch": 1.4477337347382506, + "grad_norm": 12.99917532888424, + "learning_rate": 6.165030861624663e-07, + "logits/chosen": -0.269326388835907, + "logits/rejected": 0.01531993132084608, + "logps/chosen": -2.7049174308776855, + "logps/rejected": -3.7752068042755127, + "loss": 0.4784, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.7049174308776855, + "rewards/margins": 1.0702893733978271, + "rewards/rejected": -3.7752068042755127, + "sft_loss": 2.769993305206299, + "step": 2705 + }, + { + "epoch": 1.4504097675196521, + "grad_norm": 15.836204895419154, + "learning_rate": 6.149879879003876e-07, + "logits/chosen": -0.11991055309772491, + "logits/rejected": -0.10178259760141373, + "logps/chosen": -2.7728562355041504, + "logps/rejected": -3.674401044845581, + "loss": 0.5164, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.7728562355041504, + "rewards/margins": 0.9015445709228516, + "rewards/rejected": -3.674401044845581, + "sft_loss": 2.895869731903076, + "step": 2710 + }, + { + "epoch": 1.4530858003010536, + "grad_norm": 11.375428045803405, + "learning_rate": 6.13471774024346e-07, + "logits/chosen": -0.2894328534603119, + "logits/rejected": -0.14026543498039246, + "logps/chosen": -2.712986469268799, + "logps/rejected": -3.547203779220581, + "loss": 0.5218, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.712986469268799, + "rewards/margins": 0.8342171907424927, + "rewards/rejected": -3.547203779220581, + "sft_loss": 2.893850326538086, + "step": 2715 + }, + { + "epoch": 1.4557618330824553, + "grad_norm": 12.298204703115006, + "learning_rate": 6.119544592446551e-07, + "logits/chosen": -0.2140885889530182, + "logits/rejected": -0.06003519147634506, + "logps/chosen": -2.8017475605010986, + "logps/rejected": -3.537752628326416, + "loss": 0.5506, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.8017475605010986, + "rewards/margins": 0.7360051870346069, + "rewards/rejected": -3.537752628326416, + "sft_loss": 2.888538122177124, + "step": 2720 + }, + { + "epoch": 1.4584378658638568, + "grad_norm": 12.601854566833609, + "learning_rate": 6.104360582823096e-07, + "logits/chosen": -0.17153920233249664, + "logits/rejected": -0.04675662890076637, + "logps/chosen": -2.7769007682800293, + "logps/rejected": -3.5771872997283936, + "loss": 0.5458, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.7769007682800293, + "rewards/margins": 0.8002867698669434, + "rewards/rejected": -3.5771872997283936, + "sft_loss": 2.9473347663879395, + "step": 2725 + }, + { + "epoch": 1.4611138986452583, + "grad_norm": 18.364600743337366, + "learning_rate": 6.089165858688423e-07, + "logits/chosen": -0.18823352456092834, + "logits/rejected": 0.02625465765595436, + "logps/chosen": -2.734922170639038, + "logps/rejected": -3.678147077560425, + "loss": 0.5376, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.734922170639038, + "rewards/margins": 0.9432247877120972, + "rewards/rejected": -3.678147077560425, + "sft_loss": 2.9276623725891113, + "step": 2730 + }, + { + "epoch": 1.46378993142666, + "grad_norm": 11.07782177923181, + "learning_rate": 6.073960567461811e-07, + "logits/chosen": -0.20135828852653503, + "logits/rejected": 0.026552096009254456, + "logps/chosen": -2.5619168281555176, + "logps/rejected": -3.5634264945983887, + "loss": 0.4779, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -2.5619168281555176, + "rewards/margins": 1.0015099048614502, + "rewards/rejected": -3.5634264945983887, + "sft_loss": 2.7775845527648926, + "step": 2735 + }, + { + "epoch": 1.4664659642080615, + "grad_norm": 13.098442571344702, + "learning_rate": 6.058744856665065e-07, + "logits/chosen": -0.2057342827320099, + "logits/rejected": -0.06957806646823883, + "logps/chosen": -2.756690502166748, + "logps/rejected": -3.7283108234405518, + "loss": 0.5006, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.756690502166748, + "rewards/margins": 0.9716199040412903, + "rewards/rejected": -3.7283108234405518, + "sft_loss": 2.9453227519989014, + "step": 2740 + }, + { + "epoch": 1.469141996989463, + "grad_norm": 14.17214298836207, + "learning_rate": 6.043518873921074e-07, + "logits/chosen": -0.2255820780992508, + "logits/rejected": -0.038780681788921356, + "logps/chosen": -2.7394678592681885, + "logps/rejected": -3.5877277851104736, + "loss": 0.5144, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.7394678592681885, + "rewards/margins": 0.8482600450515747, + "rewards/rejected": -3.5877277851104736, + "sft_loss": 2.8428146839141846, + "step": 2745 + }, + { + "epoch": 1.4718180297708647, + "grad_norm": 16.81159688729916, + "learning_rate": 6.028282766952393e-07, + "logits/chosen": -0.15135575830936432, + "logits/rejected": -0.013321302831172943, + "logps/chosen": -2.8603711128234863, + "logps/rejected": -3.800457000732422, + "loss": 0.5129, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.8603711128234863, + "rewards/margins": 0.9400860071182251, + "rewards/rejected": -3.800457000732422, + "sft_loss": 2.9740004539489746, + "step": 2750 + }, + { + "epoch": 1.4744940625522662, + "grad_norm": 22.76507570539442, + "learning_rate": 6.013036683579798e-07, + "logits/chosen": -0.10337056964635849, + "logits/rejected": 0.040584810078144073, + "logps/chosen": -2.8502213954925537, + "logps/rejected": -3.8350982666015625, + "loss": 0.5026, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.8502213954925537, + "rewards/margins": 0.9848769307136536, + "rewards/rejected": -3.8350982666015625, + "sft_loss": 3.0348498821258545, + "step": 2755 + }, + { + "epoch": 1.4771700953336677, + "grad_norm": 14.477476737366594, + "learning_rate": 5.997780771720854e-07, + "logits/chosen": -0.2488655149936676, + "logits/rejected": -0.011253480799496174, + "logps/chosen": -2.973107099533081, + "logps/rejected": -4.006959438323975, + "loss": 0.4991, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.973107099533081, + "rewards/margins": 1.033852458000183, + "rewards/rejected": -4.006959438323975, + "sft_loss": 3.1735317707061768, + "step": 2760 + }, + { + "epoch": 1.4798461281150694, + "grad_norm": 20.500259800881533, + "learning_rate": 5.982515179388486e-07, + "logits/chosen": -0.14365240931510925, + "logits/rejected": 0.004178575240075588, + "logps/chosen": -2.9276845455169678, + "logps/rejected": -3.8023598194122314, + "loss": 0.5415, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.9276845455169678, + "rewards/margins": 0.8746751546859741, + "rewards/rejected": -3.8023598194122314, + "sft_loss": 3.1712915897369385, + "step": 2765 + }, + { + "epoch": 1.482522160896471, + "grad_norm": 14.336439014965194, + "learning_rate": 5.967240054689541e-07, + "logits/chosen": -0.2544417083263397, + "logits/rejected": -0.1564224660396576, + "logps/chosen": -2.945641040802002, + "logps/rejected": -3.751100540161133, + "loss": 0.5694, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.945641040802002, + "rewards/margins": 0.8054594993591309, + "rewards/rejected": -3.751100540161133, + "sft_loss": 3.140873670578003, + "step": 2770 + }, + { + "epoch": 1.4851981936778724, + "grad_norm": 16.712349793859868, + "learning_rate": 5.951955545823342e-07, + "logits/chosen": -0.14263644814491272, + "logits/rejected": -0.03438568860292435, + "logps/chosen": -3.0389950275421143, + "logps/rejected": -4.015830039978027, + "loss": 0.5333, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -3.0389950275421143, + "rewards/margins": 0.9768354296684265, + "rewards/rejected": -4.015830039978027, + "sft_loss": 3.184089422225952, + "step": 2775 + }, + { + "epoch": 1.4878742264592741, + "grad_norm": 13.43688652506777, + "learning_rate": 5.936661801080263e-07, + "logits/chosen": -0.14340347051620483, + "logits/rejected": -0.0075989714823663235, + "logps/chosen": -3.104412794113159, + "logps/rejected": -3.8261265754699707, + "loss": 0.6125, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -3.104412794113159, + "rewards/margins": 0.7217133641242981, + "rewards/rejected": -3.8261265754699707, + "sft_loss": 3.140293598175049, + "step": 2780 + }, + { + "epoch": 1.4905502592406756, + "grad_norm": 11.308275521719805, + "learning_rate": 5.92135896884028e-07, + "logits/chosen": -0.20025837421417236, + "logits/rejected": -0.013455281034111977, + "logps/chosen": -3.028421401977539, + "logps/rejected": -3.971764087677002, + "loss": 0.5263, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.028421401977539, + "rewards/margins": 0.9433425664901733, + "rewards/rejected": -3.971764087677002, + "sft_loss": 3.062842845916748, + "step": 2785 + }, + { + "epoch": 1.4932262920220774, + "grad_norm": 18.922995983184755, + "learning_rate": 5.906047197571541e-07, + "logits/chosen": -0.12423284351825714, + "logits/rejected": -0.1412658989429474, + "logps/chosen": -2.9017343521118164, + "logps/rejected": -3.7224926948547363, + "loss": 0.5747, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.9017343521118164, + "rewards/margins": 0.820758044719696, + "rewards/rejected": -3.7224926948547363, + "sft_loss": 3.1434712409973145, + "step": 2790 + }, + { + "epoch": 1.4959023248034788, + "grad_norm": 12.450582467267319, + "learning_rate": 5.890726635828919e-07, + "logits/chosen": -0.041972313076257706, + "logits/rejected": -0.03934203460812569, + "logps/chosen": -2.6451210975646973, + "logps/rejected": -3.56543231010437, + "loss": 0.5393, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.6451210975646973, + "rewards/margins": 0.9203113317489624, + "rewards/rejected": -3.56543231010437, + "sft_loss": 2.757323741912842, + "step": 2795 + }, + { + "epoch": 1.4985783575848803, + "grad_norm": 18.413613848092343, + "learning_rate": 5.875397432252569e-07, + "logits/chosen": -0.19127897918224335, + "logits/rejected": -0.10696186125278473, + "logps/chosen": -2.6481118202209473, + "logps/rejected": -3.434417247772217, + "loss": 0.5512, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.6481118202209473, + "rewards/margins": 0.7863054275512695, + "rewards/rejected": -3.434417247772217, + "sft_loss": 2.8000905513763428, + "step": 2800 + }, + { + "epoch": 1.4985783575848803, + "eval_logits/chosen": 0.1750119924545288, + "eval_logits/rejected": 0.28923624753952026, + "eval_logps/chosen": -2.7513301372528076, + "eval_logps/rejected": -3.5751004219055176, + "eval_loss": 0.5563209652900696, + "eval_rewards/accuracies": 0.7203264236450195, + "eval_rewards/chosen": -2.7513301372528076, + "eval_rewards/margins": 0.8237702250480652, + "eval_rewards/rejected": -3.5751004219055176, + "eval_runtime": 50.2195, + "eval_samples_per_second": 26.782, + "eval_sft_loss": 2.9053328037261963, + "eval_steps_per_second": 6.711, + "step": 2800 + }, + { + "epoch": 1.5012543903662818, + "grad_norm": 10.41761647965276, + "learning_rate": 5.860059735566491e-07, + "logits/chosen": -0.332925945520401, + "logits/rejected": -0.13850803673267365, + "logps/chosen": -2.547250270843506, + "logps/rejected": -3.3842644691467285, + "loss": 0.5283, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.547250270843506, + "rewards/margins": 0.837013840675354, + "rewards/rejected": -3.3842644691467285, + "sft_loss": 2.7107858657836914, + "step": 2805 + }, + { + "epoch": 1.5039304231476835, + "grad_norm": 15.905805367045609, + "learning_rate": 5.844713694577087e-07, + "logits/chosen": -0.169723242521286, + "logits/rejected": -0.06621355563402176, + "logps/chosen": -2.638108968734741, + "logps/rejected": -3.4682857990264893, + "loss": 0.5378, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.638108968734741, + "rewards/margins": 0.8301769495010376, + "rewards/rejected": -3.4682857990264893, + "sft_loss": 2.8118090629577637, + "step": 2810 + }, + { + "epoch": 1.5066064559290853, + "grad_norm": 10.777745910778588, + "learning_rate": 5.829359458171714e-07, + "logits/chosen": -0.1262860745191574, + "logits/rejected": 0.008149102330207825, + "logps/chosen": -2.671485662460327, + "logps/rejected": -3.7072227001190186, + "loss": 0.4716, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -2.671485662460327, + "rewards/margins": 1.0357367992401123, + "rewards/rejected": -3.7072227001190186, + "sft_loss": 2.7742977142333984, + "step": 2815 + }, + { + "epoch": 1.5092824887104868, + "grad_norm": 13.279294927026276, + "learning_rate": 5.81399717531724e-07, + "logits/chosen": -0.21378269791603088, + "logits/rejected": 0.0385824516415596, + "logps/chosen": -2.8625640869140625, + "logps/rejected": -3.6599457263946533, + "loss": 0.5676, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.8625640869140625, + "rewards/margins": 0.7973818182945251, + "rewards/rejected": -3.6599457263946533, + "sft_loss": 3.028451919555664, + "step": 2820 + }, + { + "epoch": 1.5119585214918883, + "grad_norm": 13.266146592543238, + "learning_rate": 5.798626995058602e-07, + "logits/chosen": -0.2696044445037842, + "logits/rejected": -0.037425920367240906, + "logps/chosen": -2.9013805389404297, + "logps/rejected": -3.7950050830841064, + "loss": 0.5276, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.9013805389404297, + "rewards/margins": 0.893624484539032, + "rewards/rejected": -3.7950050830841064, + "sft_loss": 3.0186824798583984, + "step": 2825 + }, + { + "epoch": 1.51463455427329, + "grad_norm": 15.104956220264441, + "learning_rate": 5.783249066517354e-07, + "logits/chosen": -0.17140641808509827, + "logits/rejected": -0.01820288598537445, + "logps/chosen": -2.724626064300537, + "logps/rejected": -3.6410300731658936, + "loss": 0.5089, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.724626064300537, + "rewards/margins": 0.9164039492607117, + "rewards/rejected": -3.6410300731658936, + "sft_loss": 2.8896212577819824, + "step": 2830 + }, + { + "epoch": 1.5173105870546915, + "grad_norm": 33.067123229566405, + "learning_rate": 5.767863538890228e-07, + "logits/chosen": -0.19309866428375244, + "logits/rejected": -0.01465575397014618, + "logps/chosen": -2.8873116970062256, + "logps/rejected": -3.890298843383789, + "loss": 0.5101, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.8873116970062256, + "rewards/margins": 1.0029871463775635, + "rewards/rejected": -3.890298843383789, + "sft_loss": 3.0265347957611084, + "step": 2835 + }, + { + "epoch": 1.519986619836093, + "grad_norm": 16.68826289031432, + "learning_rate": 5.75247056144768e-07, + "logits/chosen": -0.18705160915851593, + "logits/rejected": -0.09919128566980362, + "logps/chosen": -2.9234955310821533, + "logps/rejected": -3.677717685699463, + "loss": 0.5891, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.9234955310821533, + "rewards/margins": 0.7542222738265991, + "rewards/rejected": -3.677717685699463, + "sft_loss": 3.084500789642334, + "step": 2840 + }, + { + "epoch": 1.5226626526174947, + "grad_norm": 18.399868318413944, + "learning_rate": 5.737070283532444e-07, + "logits/chosen": -0.14178113639354706, + "logits/rejected": -0.03312011808156967, + "logps/chosen": -2.937467336654663, + "logps/rejected": -3.7148735523223877, + "loss": 0.6212, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.937467336654663, + "rewards/margins": 0.7774060964584351, + "rewards/rejected": -3.7148735523223877, + "sft_loss": 2.9519081115722656, + "step": 2845 + }, + { + "epoch": 1.5253386853988962, + "grad_norm": 12.134442423216148, + "learning_rate": 5.721662854558084e-07, + "logits/chosen": -0.19238229095935822, + "logits/rejected": -0.06883934885263443, + "logps/chosen": -2.799609422683716, + "logps/rejected": -3.813912868499756, + "loss": 0.4851, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.799609422683716, + "rewards/margins": 1.0143041610717773, + "rewards/rejected": -3.813912868499756, + "sft_loss": 2.959176540374756, + "step": 2850 + }, + { + "epoch": 1.5280147181802977, + "grad_norm": 12.525962827150428, + "learning_rate": 5.706248424007545e-07, + "logits/chosen": -0.22883224487304688, + "logits/rejected": -0.01344398595392704, + "logps/chosen": -2.968219041824341, + "logps/rejected": -3.826395034790039, + "loss": 0.5412, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.968219041824341, + "rewards/margins": 0.8581761121749878, + "rewards/rejected": -3.826395034790039, + "sft_loss": 3.1020607948303223, + "step": 2855 + }, + { + "epoch": 1.5306907509616994, + "grad_norm": 14.910898523551785, + "learning_rate": 5.690827141431699e-07, + "logits/chosen": -0.2574513554573059, + "logits/rejected": -0.019537249580025673, + "logps/chosen": -2.833799123764038, + "logps/rejected": -3.614091157913208, + "loss": 0.5313, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.833799123764038, + "rewards/margins": 0.7802920341491699, + "rewards/rejected": -3.614091157913208, + "sft_loss": 2.9435155391693115, + "step": 2860 + }, + { + "epoch": 1.5333667837431009, + "grad_norm": 19.252025215234966, + "learning_rate": 5.675399156447897e-07, + "logits/chosen": -0.2882543206214905, + "logits/rejected": -0.1258959323167801, + "logps/chosen": -2.9176371097564697, + "logps/rejected": -3.582078456878662, + "loss": 0.5927, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.9176371097564697, + "rewards/margins": 0.6644415259361267, + "rewards/rejected": -3.582078456878662, + "sft_loss": 3.088707447052002, + "step": 2865 + }, + { + "epoch": 1.5360428165245024, + "grad_norm": 14.712122093475505, + "learning_rate": 5.659964618738515e-07, + "logits/chosen": -0.20669814944267273, + "logits/rejected": -0.06127766892313957, + "logps/chosen": -2.803532123565674, + "logps/rejected": -3.5541293621063232, + "loss": 0.5624, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.803532123565674, + "rewards/margins": 0.7505972981452942, + "rewards/rejected": -3.5541293621063232, + "sft_loss": 2.925236701965332, + "step": 2870 + }, + { + "epoch": 1.538718849305904, + "grad_norm": 17.12798243385786, + "learning_rate": 5.644523678049509e-07, + "logits/chosen": -0.2009466141462326, + "logits/rejected": -0.05111227184534073, + "logps/chosen": -2.793321132659912, + "logps/rejected": -3.577846050262451, + "loss": 0.5411, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.793321132659912, + "rewards/margins": 0.7845247983932495, + "rewards/rejected": -3.577846050262451, + "sft_loss": 2.881221055984497, + "step": 2875 + }, + { + "epoch": 1.5413948820873056, + "grad_norm": 15.641371082674475, + "learning_rate": 5.629076484188952e-07, + "logits/chosen": -0.05735234171152115, + "logits/rejected": 0.07924290746450424, + "logps/chosen": -2.6026575565338135, + "logps/rejected": -3.470081329345703, + "loss": 0.5192, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.6026575565338135, + "rewards/margins": 0.8674238324165344, + "rewards/rejected": -3.470081329345703, + "sft_loss": 2.746920108795166, + "step": 2880 + }, + { + "epoch": 1.544070914868707, + "grad_norm": 13.5763732852673, + "learning_rate": 5.613623187025587e-07, + "logits/chosen": -0.17112448811531067, + "logits/rejected": -0.008452496491372585, + "logps/chosen": -2.730259418487549, + "logps/rejected": -3.6526596546173096, + "loss": 0.5129, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.730259418487549, + "rewards/margins": 0.9224007725715637, + "rewards/rejected": -3.6526596546173096, + "sft_loss": 2.858396291732788, + "step": 2885 + }, + { + "epoch": 1.5467469476501088, + "grad_norm": 10.988273230358, + "learning_rate": 5.598163936487369e-07, + "logits/chosen": -0.2011490762233734, + "logits/rejected": 0.03283718600869179, + "logps/chosen": -2.794330358505249, + "logps/rejected": -3.887986660003662, + "loss": 0.4753, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.794330358505249, + "rewards/margins": 1.0936561822891235, + "rewards/rejected": -3.887986660003662, + "sft_loss": 2.8787245750427246, + "step": 2890 + }, + { + "epoch": 1.5494229804315103, + "grad_norm": 15.782631448647228, + "learning_rate": 5.582698882560017e-07, + "logits/chosen": -0.17405319213867188, + "logits/rejected": 0.021756382659077644, + "logps/chosen": -2.754683017730713, + "logps/rejected": -3.586590528488159, + "loss": 0.5615, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.754683017730713, + "rewards/margins": 0.8319074511528015, + "rewards/rejected": -3.586590528488159, + "sft_loss": 2.8560948371887207, + "step": 2895 + }, + { + "epoch": 1.5520990132129118, + "grad_norm": 11.187595581042258, + "learning_rate": 5.567228175285549e-07, + "logits/chosen": -0.11427092552185059, + "logits/rejected": 0.023462316021323204, + "logps/chosen": -2.862224578857422, + "logps/rejected": -3.868736743927002, + "loss": 0.4852, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.862224578857422, + "rewards/margins": 1.006511926651001, + "rewards/rejected": -3.868736743927002, + "sft_loss": 2.9960880279541016, + "step": 2900 + }, + { + "epoch": 1.5547750459943135, + "grad_norm": 15.811635282671729, + "learning_rate": 5.551751964760838e-07, + "logits/chosen": -0.0515236034989357, + "logits/rejected": -0.028668904677033424, + "logps/chosen": -2.864414930343628, + "logps/rejected": -3.809762954711914, + "loss": 0.5122, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.864414930343628, + "rewards/margins": 0.9453479647636414, + "rewards/rejected": -3.809762954711914, + "sft_loss": 3.0091872215270996, + "step": 2905 + }, + { + "epoch": 1.557451078775715, + "grad_norm": 19.555853889432004, + "learning_rate": 5.536270401136145e-07, + "logits/chosen": -0.17987671494483948, + "logits/rejected": -0.03466006740927696, + "logps/chosen": -2.9958698749542236, + "logps/rejected": -3.8351073265075684, + "loss": 0.5502, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.9958698749542236, + "rewards/margins": 0.8392373323440552, + "rewards/rejected": -3.8351073265075684, + "sft_loss": 3.2206878662109375, + "step": 2910 + }, + { + "epoch": 1.5601271115571165, + "grad_norm": 19.875087782818508, + "learning_rate": 5.520783634613667e-07, + "logits/chosen": -0.11352671682834625, + "logits/rejected": 0.12970559298992157, + "logps/chosen": -2.9497323036193848, + "logps/rejected": -3.9853293895721436, + "loss": 0.5179, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.9497323036193848, + "rewards/margins": 1.0355972051620483, + "rewards/rejected": -3.9853293895721436, + "sft_loss": 3.243316650390625, + "step": 2915 + }, + { + "epoch": 1.5628031443385182, + "grad_norm": 21.14982034068398, + "learning_rate": 5.505291815446082e-07, + "logits/chosen": -0.13817700743675232, + "logits/rejected": 0.0019467368256300688, + "logps/chosen": -3.080827236175537, + "logps/rejected": -4.003448009490967, + "loss": 0.5712, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.080827236175537, + "rewards/margins": 0.9226205945014954, + "rewards/rejected": -4.003448009490967, + "sft_loss": 3.302227020263672, + "step": 2920 + }, + { + "epoch": 1.5654791771199197, + "grad_norm": 16.15882869564911, + "learning_rate": 5.489795093935089e-07, + "logits/chosen": -0.11244082450866699, + "logits/rejected": 0.02112450823187828, + "logps/chosen": -2.877242088317871, + "logps/rejected": -3.7602596282958984, + "loss": 0.565, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.877242088317871, + "rewards/margins": 0.8830181360244751, + "rewards/rejected": -3.7602596282958984, + "sft_loss": 3.063002347946167, + "step": 2925 + }, + { + "epoch": 1.5681552099013212, + "grad_norm": 14.919029677524945, + "learning_rate": 5.474293620429946e-07, + "logits/chosen": -0.26579058170318604, + "logits/rejected": -0.04727768152952194, + "logps/chosen": -2.7573142051696777, + "logps/rejected": -3.9940857887268066, + "loss": 0.474, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.7573142051696777, + "rewards/margins": 1.2367714643478394, + "rewards/rejected": -3.9940857887268066, + "sft_loss": 3.0310394763946533, + "step": 2930 + }, + { + "epoch": 1.570831242682723, + "grad_norm": 16.447429484587722, + "learning_rate": 5.458787545326018e-07, + "logits/chosen": -0.21073214709758759, + "logits/rejected": -0.06848563253879547, + "logps/chosen": -2.97898530960083, + "logps/rejected": -3.864051103591919, + "loss": 0.5328, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.97898530960083, + "rewards/margins": 0.8850658535957336, + "rewards/rejected": -3.864051103591919, + "sft_loss": 3.101569652557373, + "step": 2935 + }, + { + "epoch": 1.5735072754641244, + "grad_norm": 15.746122226760354, + "learning_rate": 5.443277019063311e-07, + "logits/chosen": -0.251105397939682, + "logits/rejected": -0.05666235834360123, + "logps/chosen": -2.954221487045288, + "logps/rejected": -4.072968006134033, + "loss": 0.5307, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.954221487045288, + "rewards/margins": 1.118746280670166, + "rewards/rejected": -4.072968006134033, + "sft_loss": 3.1267192363739014, + "step": 2940 + }, + { + "epoch": 1.5761833082455259, + "grad_norm": 20.737025541581826, + "learning_rate": 5.427762192125023e-07, + "logits/chosen": -0.18963779509067535, + "logits/rejected": -0.04109461233019829, + "logps/chosen": -2.91569447517395, + "logps/rejected": -3.7815444469451904, + "loss": 0.5441, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.91569447517395, + "rewards/margins": 0.8658501505851746, + "rewards/rejected": -3.7815444469451904, + "sft_loss": 3.0128207206726074, + "step": 2945 + }, + { + "epoch": 1.5788593410269276, + "grad_norm": 18.26540381429684, + "learning_rate": 5.41224321503607e-07, + "logits/chosen": -0.18460258841514587, + "logits/rejected": 0.1022685170173645, + "logps/chosen": -2.8344292640686035, + "logps/rejected": -3.855309247970581, + "loss": 0.4855, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -2.8344292640686035, + "rewards/margins": 1.020879864692688, + "rewards/rejected": -3.855309247970581, + "sft_loss": 2.9823155403137207, + "step": 2950 + }, + { + "epoch": 1.5815353738083293, + "grad_norm": 16.947184497240457, + "learning_rate": 5.396720238361637e-07, + "logits/chosen": -0.13508784770965576, + "logits/rejected": 0.007212462835013866, + "logps/chosen": -2.922914981842041, + "logps/rejected": -3.78143310546875, + "loss": 0.5484, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.922914981842041, + "rewards/margins": 0.8585184216499329, + "rewards/rejected": -3.78143310546875, + "sft_loss": 3.1644444465637207, + "step": 2955 + }, + { + "epoch": 1.5842114065897306, + "grad_norm": 10.8241391582058, + "learning_rate": 5.381193412705711e-07, + "logits/chosen": -0.258370041847229, + "logits/rejected": -0.08109404146671295, + "logps/chosen": -2.8407623767852783, + "logps/rejected": -3.7270267009735107, + "loss": 0.5045, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -2.8407623767852783, + "rewards/margins": 0.8862646818161011, + "rewards/rejected": -3.7270267009735107, + "sft_loss": 2.9842042922973633, + "step": 2960 + }, + { + "epoch": 1.5868874393711323, + "grad_norm": 12.363580477888307, + "learning_rate": 5.365662888709622e-07, + "logits/chosen": -0.2032860517501831, + "logits/rejected": -0.04430514574050903, + "logps/chosen": -2.98106050491333, + "logps/rejected": -4.009333610534668, + "loss": 0.5141, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.98106050491333, + "rewards/margins": 1.0282728672027588, + "rewards/rejected": -4.009333610534668, + "sft_loss": 3.1617591381073, + "step": 2965 + }, + { + "epoch": 1.589563472152534, + "grad_norm": 23.654763880923753, + "learning_rate": 5.350128817050585e-07, + "logits/chosen": -0.2287582904100418, + "logits/rejected": -0.0005988016491755843, + "logps/chosen": -3.0337624549865723, + "logps/rejected": -3.9518356323242188, + "loss": 0.5504, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -3.0337624549865723, + "rewards/margins": 0.9180733561515808, + "rewards/rejected": -3.9518356323242188, + "sft_loss": 3.1794168949127197, + "step": 2970 + }, + { + "epoch": 1.5922395049339353, + "grad_norm": 18.4826698939958, + "learning_rate": 5.334591348440229e-07, + "logits/chosen": -0.16902866959571838, + "logits/rejected": 0.02397763356566429, + "logps/chosen": -2.8789451122283936, + "logps/rejected": -3.7171578407287598, + "loss": 0.5406, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.8789451122283936, + "rewards/margins": 0.8382126688957214, + "rewards/rejected": -3.7171578407287598, + "sft_loss": 3.027923107147217, + "step": 2975 + }, + { + "epoch": 1.594915537715337, + "grad_norm": 12.616492772128158, + "learning_rate": 5.319050633623141e-07, + "logits/chosen": -0.22738048434257507, + "logits/rejected": -0.019261473789811134, + "logps/chosen": -2.9477336406707764, + "logps/rejected": -3.7788281440734863, + "loss": 0.5202, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.9477336406707764, + "rewards/margins": 0.8310942649841309, + "rewards/rejected": -3.7788281440734863, + "sft_loss": 3.102329730987549, + "step": 2980 + }, + { + "epoch": 1.5975915704967387, + "grad_norm": 17.002375245551736, + "learning_rate": 5.303506823375409e-07, + "logits/chosen": -0.22807928919792175, + "logits/rejected": 0.0325554795563221, + "logps/chosen": -2.9772753715515137, + "logps/rejected": -4.074067115783691, + "loss": 0.5154, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.9772753715515137, + "rewards/margins": 1.0967915058135986, + "rewards/rejected": -4.074067115783691, + "sft_loss": 3.0923879146575928, + "step": 2985 + }, + { + "epoch": 1.60026760327814, + "grad_norm": 13.948824541303509, + "learning_rate": 5.287960068503143e-07, + "logits/chosen": -0.2504068613052368, + "logits/rejected": 0.0015707932179793715, + "logps/chosen": -2.912179470062256, + "logps/rejected": -3.9256370067596436, + "loss": 0.5103, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.912179470062256, + "rewards/margins": 1.0134575366973877, + "rewards/rejected": -3.9256370067596436, + "sft_loss": 3.058387517929077, + "step": 2990 + }, + { + "epoch": 1.6029436360595417, + "grad_norm": 17.29165259872914, + "learning_rate": 5.272410519841032e-07, + "logits/chosen": -0.14839215576648712, + "logits/rejected": 0.014627779833972454, + "logps/chosen": -3.000635862350464, + "logps/rejected": -4.1716814041137695, + "loss": 0.4851, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -3.000635862350464, + "rewards/margins": 1.1710458993911743, + "rewards/rejected": -4.1716814041137695, + "sft_loss": 3.2312474250793457, + "step": 2995 + }, + { + "epoch": 1.6056196688409434, + "grad_norm": 11.933730730663703, + "learning_rate": 5.256858328250861e-07, + "logits/chosen": -0.20798341929912567, + "logits/rejected": 0.023721005767583847, + "logps/chosen": -2.970376968383789, + "logps/rejected": -3.7593460083007812, + "loss": 0.5722, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.970376968383789, + "rewards/margins": 0.7889685034751892, + "rewards/rejected": -3.7593460083007812, + "sft_loss": 3.0607645511627197, + "step": 3000 + }, + { + "epoch": 1.608295701622345, + "grad_norm": 28.095890549973145, + "learning_rate": 5.241303644620063e-07, + "logits/chosen": -0.30875033140182495, + "logits/rejected": -0.09672629833221436, + "logps/chosen": -2.978696823120117, + "logps/rejected": -3.6481051445007324, + "loss": 0.6136, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.978696823120117, + "rewards/margins": 0.6694087982177734, + "rewards/rejected": -3.6481051445007324, + "sft_loss": 3.108910083770752, + "step": 3005 + }, + { + "epoch": 1.6109717344037464, + "grad_norm": 19.960336015122536, + "learning_rate": 5.225746619860248e-07, + "logits/chosen": -0.25689318776130676, + "logits/rejected": -0.09589236229658127, + "logps/chosen": -2.924680709838867, + "logps/rejected": -3.665656328201294, + "loss": 0.6023, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.924680709838867, + "rewards/margins": 0.7409757375717163, + "rewards/rejected": -3.665656328201294, + "sft_loss": 3.052212953567505, + "step": 3010 + }, + { + "epoch": 1.6136477671851481, + "grad_norm": 19.465256166030574, + "learning_rate": 5.210187404905735e-07, + "logits/chosen": -0.03861024230718613, + "logits/rejected": 0.05268191546201706, + "logps/chosen": -2.995497226715088, + "logps/rejected": -3.8167786598205566, + "loss": 0.5544, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.995497226715088, + "rewards/margins": 0.8212817311286926, + "rewards/rejected": -3.8167786598205566, + "sft_loss": 3.1183435916900635, + "step": 3015 + }, + { + "epoch": 1.6163237999665496, + "grad_norm": 14.179972871464367, + "learning_rate": 5.194626150712098e-07, + "logits/chosen": -0.22885003685951233, + "logits/rejected": -0.08128555119037628, + "logps/chosen": -2.917171001434326, + "logps/rejected": -3.7479748725891113, + "loss": 0.5307, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.917171001434326, + "rewards/margins": 0.8308032751083374, + "rewards/rejected": -3.7479748725891113, + "sft_loss": 3.0888686180114746, + "step": 3020 + }, + { + "epoch": 1.6189998327479511, + "grad_norm": 13.640509203922976, + "learning_rate": 5.179063008254695e-07, + "logits/chosen": -0.21364569664001465, + "logits/rejected": -0.00813610665500164, + "logps/chosen": -2.7993874549865723, + "logps/rejected": -3.598403215408325, + "loss": 0.561, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.7993874549865723, + "rewards/margins": 0.799015998840332, + "rewards/rejected": -3.598403215408325, + "sft_loss": 2.9994029998779297, + "step": 3025 + }, + { + "epoch": 1.6216758655293528, + "grad_norm": 14.4445070123789, + "learning_rate": 5.163498128527199e-07, + "logits/chosen": -0.15052303671836853, + "logits/rejected": 0.02527775429189205, + "logps/chosen": -2.982490301132202, + "logps/rejected": -3.844486951828003, + "loss": 0.5513, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.982490301132202, + "rewards/margins": 0.8619967699050903, + "rewards/rejected": -3.844486951828003, + "sft_loss": 3.0802769660949707, + "step": 3030 + }, + { + "epoch": 1.6243518983107543, + "grad_norm": 15.812415149593381, + "learning_rate": 5.147931662540144e-07, + "logits/chosen": -0.06317378580570221, + "logits/rejected": 0.1144840270280838, + "logps/chosen": -2.8584022521972656, + "logps/rejected": -3.5950756072998047, + "loss": 0.5475, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.8584022521972656, + "rewards/margins": 0.7366732954978943, + "rewards/rejected": -3.5950756072998047, + "sft_loss": 3.009364366531372, + "step": 3035 + }, + { + "epoch": 1.6270279310921558, + "grad_norm": 11.468441941469427, + "learning_rate": 5.132363761319449e-07, + "logits/chosen": -0.1439296305179596, + "logits/rejected": -0.049841322004795074, + "logps/chosen": -2.809378147125244, + "logps/rejected": -3.8419647216796875, + "loss": 0.5151, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.809378147125244, + "rewards/margins": 1.0325868129730225, + "rewards/rejected": -3.8419647216796875, + "sft_loss": 2.957885265350342, + "step": 3040 + }, + { + "epoch": 1.6297039638735575, + "grad_norm": 34.802718071815235, + "learning_rate": 5.116794575904962e-07, + "logits/chosen": -0.13249622285366058, + "logits/rejected": -0.0025797567795962095, + "logps/chosen": -2.711029529571533, + "logps/rejected": -3.486485004425049, + "loss": 0.5684, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.711029529571533, + "rewards/margins": 0.7754554748535156, + "rewards/rejected": -3.486485004425049, + "sft_loss": 2.8514134883880615, + "step": 3045 + }, + { + "epoch": 1.632379996654959, + "grad_norm": 10.892875604035309, + "learning_rate": 5.101224257348987e-07, + "logits/chosen": -0.15007975697517395, + "logits/rejected": 0.019764890894293785, + "logps/chosen": -2.878706455230713, + "logps/rejected": -3.9150726795196533, + "loss": 0.4774, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.878706455230713, + "rewards/margins": 1.0363662242889404, + "rewards/rejected": -3.9150726795196533, + "sft_loss": 3.086242198944092, + "step": 3050 + }, + { + "epoch": 1.6350560294363605, + "grad_norm": 14.483283525837482, + "learning_rate": 5.085652956714823e-07, + "logits/chosen": -0.21432361006736755, + "logits/rejected": -0.02309691347181797, + "logps/chosen": -2.9688711166381836, + "logps/rejected": -3.8170647621154785, + "loss": 0.5388, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.9688711166381836, + "rewards/margins": 0.8481934666633606, + "rewards/rejected": -3.8170647621154785, + "sft_loss": 3.095407009124756, + "step": 3055 + }, + { + "epoch": 1.6377320622177622, + "grad_norm": 13.001074946425849, + "learning_rate": 5.070080825075298e-07, + "logits/chosen": -0.1922428458929062, + "logits/rejected": 0.04582958295941353, + "logps/chosen": -2.91438364982605, + "logps/rejected": -3.7429771423339844, + "loss": 0.5892, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.91438364982605, + "rewards/margins": 0.8285935521125793, + "rewards/rejected": -3.7429771423339844, + "sft_loss": 3.1438403129577637, + "step": 3060 + }, + { + "epoch": 1.6404080949991637, + "grad_norm": 13.995641813546811, + "learning_rate": 5.0545080135113e-07, + "logits/chosen": -0.13181616365909576, + "logits/rejected": -0.02712242677807808, + "logps/chosen": -2.84912371635437, + "logps/rejected": -3.7584915161132812, + "loss": 0.5505, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.84912371635437, + "rewards/margins": 0.909367561340332, + "rewards/rejected": -3.7584915161132812, + "sft_loss": 3.021531343460083, + "step": 3065 + }, + { + "epoch": 1.6430841277805652, + "grad_norm": 18.420291032139474, + "learning_rate": 5.038934673110316e-07, + "logits/chosen": -0.20530517399311066, + "logits/rejected": -0.06820012629032135, + "logps/chosen": -2.9428317546844482, + "logps/rejected": -3.816471815109253, + "loss": 0.5638, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.9428317546844482, + "rewards/margins": 0.8736408352851868, + "rewards/rejected": -3.816471815109253, + "sft_loss": 3.122080087661743, + "step": 3070 + }, + { + "epoch": 1.645760160561967, + "grad_norm": 12.210492099926979, + "learning_rate": 5.023360954964963e-07, + "logits/chosen": -0.2356480062007904, + "logits/rejected": -0.13236093521118164, + "logps/chosen": -2.6807234287261963, + "logps/rejected": -3.5748507976531982, + "loss": 0.4923, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.6807234287261963, + "rewards/margins": 0.8941277265548706, + "rewards/rejected": -3.5748507976531982, + "sft_loss": 2.826641082763672, + "step": 3075 + }, + { + "epoch": 1.6484361933433684, + "grad_norm": 14.451314916093441, + "learning_rate": 5.007787010171524e-07, + "logits/chosen": -0.30715638399124146, + "logits/rejected": -0.0917990654706955, + "logps/chosen": -2.5948257446289062, + "logps/rejected": -3.5277962684631348, + "loss": 0.4782, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -2.5948257446289062, + "rewards/margins": 0.932970404624939, + "rewards/rejected": -3.5277962684631348, + "sft_loss": 2.8153226375579834, + "step": 3080 + }, + { + "epoch": 1.65111222612477, + "grad_norm": 16.093189988405726, + "learning_rate": 4.992212989828477e-07, + "logits/chosen": -0.12263667583465576, + "logits/rejected": -0.08358993381261826, + "logps/chosen": -2.74635648727417, + "logps/rejected": -3.4704620838165283, + "loss": 0.5703, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.74635648727417, + "rewards/margins": 0.7241055369377136, + "rewards/rejected": -3.4704620838165283, + "sft_loss": 2.9041481018066406, + "step": 3085 + }, + { + "epoch": 1.6537882589061716, + "grad_norm": 14.303938596204755, + "learning_rate": 4.976639045035036e-07, + "logits/chosen": -0.10676582902669907, + "logits/rejected": -0.01227900106459856, + "logps/chosen": -2.698171854019165, + "logps/rejected": -3.3960328102111816, + "loss": 0.6148, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.698171854019165, + "rewards/margins": 0.697860836982727, + "rewards/rejected": -3.3960328102111816, + "sft_loss": 2.8922371864318848, + "step": 3090 + }, + { + "epoch": 1.6564642916875731, + "grad_norm": 12.995678732494168, + "learning_rate": 4.961065326889683e-07, + "logits/chosen": -0.13217754662036896, + "logits/rejected": 0.052339475601911545, + "logps/chosen": -2.795322895050049, + "logps/rejected": -3.633164644241333, + "loss": 0.5317, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.795322895050049, + "rewards/margins": 0.8378413915634155, + "rewards/rejected": -3.633164644241333, + "sft_loss": 2.9511818885803223, + "step": 3095 + }, + { + "epoch": 1.6591403244689746, + "grad_norm": 16.08044230088277, + "learning_rate": 4.9454919864887e-07, + "logits/chosen": -0.255813330411911, + "logits/rejected": -0.1016959398984909, + "logps/chosen": -2.697873592376709, + "logps/rejected": -3.5687255859375, + "loss": 0.5228, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.697873592376709, + "rewards/margins": 0.8708522915840149, + "rewards/rejected": -3.5687255859375, + "sft_loss": 2.96101975440979, + "step": 3100 + }, + { + "epoch": 1.6618163572503764, + "grad_norm": 19.529351787541103, + "learning_rate": 4.929919174924701e-07, + "logits/chosen": -0.26780134439468384, + "logits/rejected": -0.045740462839603424, + "logps/chosen": -2.713799238204956, + "logps/rejected": -3.4403533935546875, + "loss": 0.5532, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.713799238204956, + "rewards/margins": 0.7265541553497314, + "rewards/rejected": -3.4403533935546875, + "sft_loss": 2.929337501525879, + "step": 3105 + }, + { + "epoch": 1.6644923900317778, + "grad_norm": 12.31980115365111, + "learning_rate": 4.914347043285177e-07, + "logits/chosen": -0.15741178393363953, + "logits/rejected": -0.007818855345249176, + "logps/chosen": -2.7308452129364014, + "logps/rejected": -3.5612876415252686, + "loss": 0.5259, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.7308452129364014, + "rewards/margins": 0.8304422497749329, + "rewards/rejected": -3.5612876415252686, + "sft_loss": 2.8026375770568848, + "step": 3110 + }, + { + "epoch": 1.6671684228131793, + "grad_norm": 13.205443574074472, + "learning_rate": 4.898775742651013e-07, + "logits/chosen": -0.1403086930513382, + "logits/rejected": 0.01723460480570793, + "logps/chosen": -2.7844042778015137, + "logps/rejected": -3.7625949382781982, + "loss": 0.4815, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -2.7844042778015137, + "rewards/margins": 0.9781904220581055, + "rewards/rejected": -3.7625949382781982, + "sft_loss": 3.0160343647003174, + "step": 3115 + }, + { + "epoch": 1.669844455594581, + "grad_norm": 11.706451190194656, + "learning_rate": 4.883205424095037e-07, + "logits/chosen": -0.27177152037620544, + "logits/rejected": -0.07603542506694794, + "logps/chosen": -2.9312081336975098, + "logps/rejected": -3.8443360328674316, + "loss": 0.5256, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.9312081336975098, + "rewards/margins": 0.9131277203559875, + "rewards/rejected": -3.8443360328674316, + "sft_loss": 3.074876308441162, + "step": 3120 + }, + { + "epoch": 1.6725204883759828, + "grad_norm": 13.913970753690924, + "learning_rate": 4.86763623868055e-07, + "logits/chosen": -0.184868723154068, + "logits/rejected": -0.03205538168549538, + "logps/chosen": -2.962169885635376, + "logps/rejected": -3.8724663257598877, + "loss": 0.5351, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.962169885635376, + "rewards/margins": 0.910296618938446, + "rewards/rejected": -3.8724663257598877, + "sft_loss": 3.0643038749694824, + "step": 3125 + }, + { + "epoch": 1.675196521157384, + "grad_norm": 13.159797786943738, + "learning_rate": 4.852068337459856e-07, + "logits/chosen": -0.12268374860286713, + "logits/rejected": 0.09164348989725113, + "logps/chosen": -3.1019387245178223, + "logps/rejected": -3.940359592437744, + "loss": 0.5308, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -3.1019387245178223, + "rewards/margins": 0.8384206891059875, + "rewards/rejected": -3.940359592437744, + "sft_loss": 3.2816097736358643, + "step": 3130 + }, + { + "epoch": 1.6778725539387858, + "grad_norm": 15.4563527851753, + "learning_rate": 4.8365018714728e-07, + "logits/chosen": -0.11665117740631104, + "logits/rejected": -0.003966017626225948, + "logps/chosen": -3.2221763134002686, + "logps/rejected": -3.9856514930725098, + "loss": 0.5706, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.2221763134002686, + "rewards/margins": 0.763475239276886, + "rewards/rejected": -3.9856514930725098, + "sft_loss": 3.3253014087677, + "step": 3135 + }, + { + "epoch": 1.6805485867201875, + "grad_norm": 17.822799045657792, + "learning_rate": 4.820936991745304e-07, + "logits/chosen": -0.3635734021663666, + "logits/rejected": -0.2108539342880249, + "logps/chosen": -2.9746642112731934, + "logps/rejected": -3.8151726722717285, + "loss": 0.5206, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.9746642112731934, + "rewards/margins": 0.8405085802078247, + "rewards/rejected": -3.8151726722717285, + "sft_loss": 3.126021146774292, + "step": 3140 + }, + { + "epoch": 1.6832246195015887, + "grad_norm": 13.688291338768908, + "learning_rate": 4.8053738492879e-07, + "logits/chosen": -0.15512339770793915, + "logits/rejected": 0.007311081979423761, + "logps/chosen": -2.8693430423736572, + "logps/rejected": -3.8841490745544434, + "loss": 0.522, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.8693430423736572, + "rewards/margins": 1.0148061513900757, + "rewards/rejected": -3.8841490745544434, + "sft_loss": 2.981611967086792, + "step": 3145 + }, + { + "epoch": 1.6859006522829905, + "grad_norm": 13.620392772007163, + "learning_rate": 4.789812595094265e-07, + "logits/chosen": -0.2888887822628021, + "logits/rejected": -0.1388740837574005, + "logps/chosen": -2.93782377243042, + "logps/rejected": -3.9592788219451904, + "loss": 0.4875, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.93782377243042, + "rewards/margins": 1.0214550495147705, + "rewards/rejected": -3.9592788219451904, + "sft_loss": 3.0410799980163574, + "step": 3150 + }, + { + "epoch": 1.6885766850643922, + "grad_norm": 12.646316434762811, + "learning_rate": 4.774253380139752e-07, + "logits/chosen": -0.3319533169269562, + "logits/rejected": -0.17273344099521637, + "logps/chosen": -2.779099702835083, + "logps/rejected": -3.8044273853302, + "loss": 0.4882, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.779099702835083, + "rewards/margins": 1.0253279209136963, + "rewards/rejected": -3.8044273853302, + "sft_loss": 2.975861072540283, + "step": 3155 + }, + { + "epoch": 1.6912527178457935, + "grad_norm": 16.896373974454246, + "learning_rate": 4.758696355379936e-07, + "logits/chosen": -0.15754546225070953, + "logits/rejected": -0.19510313868522644, + "logps/chosen": -2.8744874000549316, + "logps/rejected": -3.818066358566284, + "loss": 0.5135, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -2.8744874000549316, + "rewards/margins": 0.9435788989067078, + "rewards/rejected": -3.818066358566284, + "sft_loss": 3.1294708251953125, + "step": 3160 + }, + { + "epoch": 1.6939287506271952, + "grad_norm": 15.788673901400218, + "learning_rate": 4.743141671749138e-07, + "logits/chosen": -0.3557151257991791, + "logits/rejected": -0.18027469515800476, + "logps/chosen": -3.043041706085205, + "logps/rejected": -3.759451389312744, + "loss": 0.6138, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -3.043041706085205, + "rewards/margins": 0.7164098024368286, + "rewards/rejected": -3.759451389312744, + "sft_loss": 3.246708393096924, + "step": 3165 + }, + { + "epoch": 1.6966047834085969, + "grad_norm": 14.237283392007136, + "learning_rate": 4.727589480158968e-07, + "logits/chosen": -0.2369193732738495, + "logits/rejected": -0.11112294346094131, + "logps/chosen": -2.933332920074463, + "logps/rejected": -3.9403202533721924, + "loss": 0.4988, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.933332920074463, + "rewards/margins": 1.0069873332977295, + "rewards/rejected": -3.9403202533721924, + "sft_loss": 3.1049280166625977, + "step": 3170 + }, + { + "epoch": 1.6992808161899984, + "grad_norm": 19.168604149317066, + "learning_rate": 4.712039931496855e-07, + "logits/chosen": -0.2870144844055176, + "logits/rejected": -0.11162833869457245, + "logps/chosen": -3.0423762798309326, + "logps/rejected": -3.660609722137451, + "loss": 0.6636, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.0423762798309326, + "rewards/margins": 0.6182333827018738, + "rewards/rejected": -3.660609722137451, + "sft_loss": 3.242758274078369, + "step": 3175 + }, + { + "epoch": 1.7019568489713999, + "grad_norm": 17.257914139514913, + "learning_rate": 4.6964931766245905e-07, + "logits/chosen": -0.1425977647304535, + "logits/rejected": -0.0717199370265007, + "logps/chosen": -2.940380573272705, + "logps/rejected": -3.841813564300537, + "loss": 0.5339, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.940380573272705, + "rewards/margins": 0.9014331102371216, + "rewards/rejected": -3.841813564300537, + "sft_loss": 3.0481081008911133, + "step": 3180 + }, + { + "epoch": 1.7046328817528016, + "grad_norm": 15.206261455382656, + "learning_rate": 4.6809493663768575e-07, + "logits/chosen": -0.1936657726764679, + "logits/rejected": -0.12913763523101807, + "logps/chosen": -2.8649189472198486, + "logps/rejected": -3.4266788959503174, + "loss": 0.6247, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.8649189472198486, + "rewards/margins": 0.5617601275444031, + "rewards/rejected": -3.4266788959503174, + "sft_loss": 3.061222553253174, + "step": 3185 + }, + { + "epoch": 1.707308914534203, + "grad_norm": 12.229514415619054, + "learning_rate": 4.6654086515597716e-07, + "logits/chosen": -0.3212862014770508, + "logits/rejected": -0.11561963707208633, + "logps/chosen": -2.8837294578552246, + "logps/rejected": -3.8425660133361816, + "loss": 0.4869, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -2.8837294578552246, + "rewards/margins": 0.9588366746902466, + "rewards/rejected": -3.8425660133361816, + "sft_loss": 3.031167507171631, + "step": 3190 + }, + { + "epoch": 1.7099849473156046, + "grad_norm": 12.193657302291385, + "learning_rate": 4.6498711829494154e-07, + "logits/chosen": -0.30257949233055115, + "logits/rejected": -0.16699732840061188, + "logps/chosen": -2.770297050476074, + "logps/rejected": -3.7176918983459473, + "loss": 0.5174, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.770297050476074, + "rewards/margins": 0.9473945498466492, + "rewards/rejected": -3.7176918983459473, + "sft_loss": 2.859004259109497, + "step": 3195 + }, + { + "epoch": 1.7126609800970063, + "grad_norm": 16.39615605105048, + "learning_rate": 4.6343371112903777e-07, + "logits/chosen": -0.18592138588428497, + "logits/rejected": -0.01131738256663084, + "logps/chosen": -2.9315178394317627, + "logps/rejected": -3.722820281982422, + "loss": 0.5766, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.9315178394317627, + "rewards/margins": 0.7913025617599487, + "rewards/rejected": -3.722820281982422, + "sft_loss": 3.017145872116089, + "step": 3200 + }, + { + "epoch": 1.7126609800970063, + "eval_logits/chosen": 0.15615563094615936, + "eval_logits/rejected": 0.267660915851593, + "eval_logps/chosen": -2.8133504390716553, + "eval_logps/rejected": -3.665544271469116, + "eval_loss": 0.551977813243866, + "eval_rewards/accuracies": 0.7262611389160156, + "eval_rewards/chosen": -2.8133504390716553, + "eval_rewards/margins": 0.8521937727928162, + "eval_rewards/rejected": -3.665544271469116, + "eval_runtime": 50.0415, + "eval_samples_per_second": 26.878, + "eval_sft_loss": 2.964270830154419, + "eval_steps_per_second": 6.734, + "step": 3200 + }, + { + "epoch": 1.7153370128784078, + "grad_norm": 11.957138109955915, + "learning_rate": 4.618806587294291e-07, + "logits/chosen": -0.33529722690582275, + "logits/rejected": -0.19825676083564758, + "logps/chosen": -2.708866596221924, + "logps/rejected": -3.683584213256836, + "loss": 0.4965, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.708866596221924, + "rewards/margins": 0.9747177958488464, + "rewards/rejected": -3.683584213256836, + "sft_loss": 2.8329052925109863, + "step": 3205 + }, + { + "epoch": 1.7180130456598093, + "grad_norm": 18.133437006125078, + "learning_rate": 4.603279761638365e-07, + "logits/chosen": -0.3051992952823639, + "logits/rejected": -0.1710871011018753, + "logps/chosen": -2.7860054969787598, + "logps/rejected": -3.4999566078186035, + "loss": 0.5994, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.7860054969787598, + "rewards/margins": 0.7139514684677124, + "rewards/rejected": -3.4999566078186035, + "sft_loss": 2.8975796699523926, + "step": 3210 + }, + { + "epoch": 1.720689078441211, + "grad_norm": 14.432897472663257, + "learning_rate": 4.5877567849639315e-07, + "logits/chosen": -0.21736936271190643, + "logits/rejected": -0.09302875399589539, + "logps/chosen": -2.8191990852355957, + "logps/rejected": -3.7241218090057373, + "loss": 0.524, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.8191990852355957, + "rewards/margins": 0.9049233198165894, + "rewards/rejected": -3.7241218090057373, + "sft_loss": 2.898805618286133, + "step": 3215 + }, + { + "epoch": 1.7233651112226125, + "grad_norm": 13.15421311840138, + "learning_rate": 4.572237807874979e-07, + "logits/chosen": -0.28606656193733215, + "logits/rejected": 0.012532521970570087, + "logps/chosen": -3.0481009483337402, + "logps/rejected": -3.9428951740264893, + "loss": 0.5897, + "rewards/accuracies": 0.71875, + "rewards/chosen": -3.0481009483337402, + "rewards/margins": 0.8947939872741699, + "rewards/rejected": -3.9428951740264893, + "sft_loss": 3.056109666824341, + "step": 3220 + }, + { + "epoch": 1.726041144004014, + "grad_norm": 14.062265382351862, + "learning_rate": 4.5567229809366895e-07, + "logits/chosen": -0.2212449610233307, + "logits/rejected": -0.07261637598276138, + "logps/chosen": -2.7670116424560547, + "logps/rejected": -3.6225428581237793, + "loss": 0.5298, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.7670116424560547, + "rewards/margins": 0.8555313348770142, + "rewards/rejected": -3.6225428581237793, + "sft_loss": 2.9393343925476074, + "step": 3225 + }, + { + "epoch": 1.7287171767854157, + "grad_norm": 21.977791827793503, + "learning_rate": 4.541212454673984e-07, + "logits/chosen": -0.2686312794685364, + "logits/rejected": -0.08200834691524506, + "logps/chosen": -2.868399143218994, + "logps/rejected": -4.024724006652832, + "loss": 0.4992, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.868399143218994, + "rewards/margins": 1.156325101852417, + "rewards/rejected": -4.024724006652832, + "sft_loss": 2.982067584991455, + "step": 3230 + }, + { + "epoch": 1.7313932095668172, + "grad_norm": 13.672277376114732, + "learning_rate": 4.525706379570055e-07, + "logits/chosen": -0.2825610041618347, + "logits/rejected": -0.16836941242218018, + "logps/chosen": -2.8134896755218506, + "logps/rejected": -3.7119224071502686, + "loss": 0.5161, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.8134896755218506, + "rewards/margins": 0.8984330296516418, + "rewards/rejected": -3.7119224071502686, + "sft_loss": 2.921664237976074, + "step": 3235 + }, + { + "epoch": 1.7340692423482187, + "grad_norm": 14.546138304989343, + "learning_rate": 4.510204906064911e-07, + "logits/chosen": -0.19505472481250763, + "logits/rejected": -0.030590301379561424, + "logps/chosen": -2.7975356578826904, + "logps/rejected": -3.816844940185547, + "loss": 0.5086, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -2.7975356578826904, + "rewards/margins": 1.0193090438842773, + "rewards/rejected": -3.816844940185547, + "sft_loss": 2.81569242477417, + "step": 3240 + }, + { + "epoch": 1.7367452751296204, + "grad_norm": 14.787915348963589, + "learning_rate": 4.4947081845539177e-07, + "logits/chosen": -0.35870975255966187, + "logits/rejected": -0.206782728433609, + "logps/chosen": -2.8665363788604736, + "logps/rejected": -3.6919448375701904, + "loss": 0.56, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.8665363788604736, + "rewards/margins": 0.8254083395004272, + "rewards/rejected": -3.6919448375701904, + "sft_loss": 2.9097416400909424, + "step": 3245 + }, + { + "epoch": 1.739421307911022, + "grad_norm": 12.871323062130514, + "learning_rate": 4.479216365386333e-07, + "logits/chosen": -0.19920530915260315, + "logits/rejected": 0.00923833716660738, + "logps/chosen": -2.8280484676361084, + "logps/rejected": -3.8082661628723145, + "loss": 0.5145, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.8280484676361084, + "rewards/margins": 0.980217456817627, + "rewards/rejected": -3.8082661628723145, + "sft_loss": 2.869939088821411, + "step": 3250 + }, + { + "epoch": 1.7420973406924234, + "grad_norm": 13.069487032537221, + "learning_rate": 4.4637295988638555e-07, + "logits/chosen": -0.21338963508605957, + "logits/rejected": -0.09642849117517471, + "logps/chosen": -2.8135313987731934, + "logps/rejected": -3.6506245136260986, + "loss": 0.5479, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.8135313987731934, + "rewards/margins": 0.8370929956436157, + "rewards/rejected": -3.6506245136260986, + "sft_loss": 2.9300215244293213, + "step": 3255 + }, + { + "epoch": 1.744773373473825, + "grad_norm": 21.564495906900454, + "learning_rate": 4.4482480352391623e-07, + "logits/chosen": -0.28455790877342224, + "logits/rejected": -0.12763725221157074, + "logps/chosen": -2.817474603652954, + "logps/rejected": -3.723423480987549, + "loss": 0.5153, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.817474603652954, + "rewards/margins": 0.90594881772995, + "rewards/rejected": -3.723423480987549, + "sft_loss": 2.937216281890869, + "step": 3260 + }, + { + "epoch": 1.7474494062552266, + "grad_norm": 23.117393791841398, + "learning_rate": 4.4327718247144507e-07, + "logits/chosen": -0.19001540541648865, + "logits/rejected": -0.04242430999875069, + "logps/chosen": -2.7809228897094727, + "logps/rejected": -3.6734752655029297, + "loss": 0.5425, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.7809228897094727, + "rewards/margins": 0.8925522565841675, + "rewards/rejected": -3.6734752655029297, + "sft_loss": 2.970322608947754, + "step": 3265 + }, + { + "epoch": 1.750125439036628, + "grad_norm": 18.135415288579836, + "learning_rate": 4.417301117439984e-07, + "logits/chosen": -0.238087460398674, + "logits/rejected": -0.09954921901226044, + "logps/chosen": -2.83843994140625, + "logps/rejected": -3.768962860107422, + "loss": 0.531, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.83843994140625, + "rewards/margins": 0.9305224418640137, + "rewards/rejected": -3.768962860107422, + "sft_loss": 2.9210972785949707, + "step": 3270 + }, + { + "epoch": 1.7528014718180298, + "grad_norm": 16.001325089443043, + "learning_rate": 4.401836063512631e-07, + "logits/chosen": -0.30430033802986145, + "logits/rejected": 0.05234812945127487, + "logps/chosen": -2.756070613861084, + "logps/rejected": -3.744997501373291, + "loss": 0.496, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.756070613861084, + "rewards/margins": 0.9889270663261414, + "rewards/rejected": -3.744997501373291, + "sft_loss": 2.9325056076049805, + "step": 3275 + }, + { + "epoch": 1.7554775045994313, + "grad_norm": 19.97497988116218, + "learning_rate": 4.386376812974413e-07, + "logits/chosen": -0.252798855304718, + "logits/rejected": -0.16332267224788666, + "logps/chosen": -2.5628037452697754, + "logps/rejected": -3.5728652477264404, + "loss": 0.5096, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -2.5628037452697754, + "rewards/margins": 1.0100618600845337, + "rewards/rejected": -3.5728652477264404, + "sft_loss": 2.757195472717285, + "step": 3280 + }, + { + "epoch": 1.7581535373808328, + "grad_norm": 13.525894288892985, + "learning_rate": 4.370923515811048e-07, + "logits/chosen": -0.2700463831424713, + "logits/rejected": -0.023473823443055153, + "logps/chosen": -2.79638934135437, + "logps/rejected": -3.7672476768493652, + "loss": 0.5042, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.79638934135437, + "rewards/margins": 0.9708584547042847, + "rewards/rejected": -3.7672476768493652, + "sft_loss": 2.8965699672698975, + "step": 3285 + }, + { + "epoch": 1.7608295701622345, + "grad_norm": 14.699034871188948, + "learning_rate": 4.35547632195049e-07, + "logits/chosen": -0.21370744705200195, + "logits/rejected": -0.08334928005933762, + "logps/chosen": -2.721179246902466, + "logps/rejected": -3.579859495162964, + "loss": 0.5228, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.721179246902466, + "rewards/margins": 0.858680248260498, + "rewards/rejected": -3.579859495162964, + "sft_loss": 2.878352165222168, + "step": 3290 + }, + { + "epoch": 1.763505602943636, + "grad_norm": 17.573298503902322, + "learning_rate": 4.340035381261484e-07, + "logits/chosen": -0.21696865558624268, + "logits/rejected": -0.09547743201255798, + "logps/chosen": -2.980989456176758, + "logps/rejected": -3.8878490924835205, + "loss": 0.5625, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.980989456176758, + "rewards/margins": 0.9068597555160522, + "rewards/rejected": -3.8878490924835205, + "sft_loss": 3.0405678749084473, + "step": 3295 + }, + { + "epoch": 1.7661816357250375, + "grad_norm": 24.841053261828165, + "learning_rate": 4.324600843552104e-07, + "logits/chosen": -0.31977978348731995, + "logits/rejected": -0.14189480245113373, + "logps/chosen": -3.106128692626953, + "logps/rejected": -4.027562618255615, + "loss": 0.5568, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -3.106128692626953, + "rewards/margins": 0.9214338064193726, + "rewards/rejected": -4.027562618255615, + "sft_loss": 3.2608656883239746, + "step": 3300 + }, + { + "epoch": 1.7688576685064392, + "grad_norm": 17.014889686723876, + "learning_rate": 4.309172858568302e-07, + "logits/chosen": -0.3131139874458313, + "logits/rejected": -0.08883488923311234, + "logps/chosen": -3.0094892978668213, + "logps/rejected": -3.9629604816436768, + "loss": 0.5439, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.0094892978668213, + "rewards/margins": 0.9534710645675659, + "rewards/rejected": -3.9629604816436768, + "sft_loss": 3.136040210723877, + "step": 3305 + }, + { + "epoch": 1.771533701287841, + "grad_norm": 20.074429307266534, + "learning_rate": 4.293751575992455e-07, + "logits/chosen": -0.12032179534435272, + "logits/rejected": -0.06570522487163544, + "logps/chosen": -3.0344769954681396, + "logps/rejected": -3.9326884746551514, + "loss": 0.5096, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.0344769954681396, + "rewards/margins": 0.898211658000946, + "rewards/rejected": -3.9326884746551514, + "sft_loss": 3.2100040912628174, + "step": 3310 + }, + { + "epoch": 1.7742097340692422, + "grad_norm": 26.469130295066087, + "learning_rate": 4.278337145441916e-07, + "logits/chosen": -0.26177433133125305, + "logits/rejected": -0.05351484566926956, + "logps/chosen": -3.0022337436676025, + "logps/rejected": -3.882744312286377, + "loss": 0.5514, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -3.0022337436676025, + "rewards/margins": 0.880510151386261, + "rewards/rejected": -3.882744312286377, + "sft_loss": 3.1298012733459473, + "step": 3315 + }, + { + "epoch": 1.776885766850644, + "grad_norm": 13.810006794454017, + "learning_rate": 4.262929716467556e-07, + "logits/chosen": -0.24092534184455872, + "logits/rejected": 0.03226093575358391, + "logps/chosen": -2.9213504791259766, + "logps/rejected": -3.9801437854766846, + "loss": 0.5226, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.9213504791259766, + "rewards/margins": 1.0587931871414185, + "rewards/rejected": -3.9801437854766846, + "sft_loss": 3.0345194339752197, + "step": 3320 + }, + { + "epoch": 1.7795617996320456, + "grad_norm": 13.141757845299274, + "learning_rate": 4.247529438552321e-07, + "logits/chosen": -0.25433140993118286, + "logits/rejected": -0.04708702117204666, + "logps/chosen": -2.885637044906616, + "logps/rejected": -3.7773375511169434, + "loss": 0.5466, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.885637044906616, + "rewards/margins": 0.8917006254196167, + "rewards/rejected": -3.7773375511169434, + "sft_loss": 3.123898983001709, + "step": 3325 + }, + { + "epoch": 1.782237832413447, + "grad_norm": 15.999497786068854, + "learning_rate": 4.232136461109773e-07, + "logits/chosen": -0.19254162907600403, + "logits/rejected": -0.05719348043203354, + "logps/chosen": -2.7694904804229736, + "logps/rejected": -3.805851697921753, + "loss": 0.4953, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.7694904804229736, + "rewards/margins": 1.0363613367080688, + "rewards/rejected": -3.805851697921753, + "sft_loss": 2.980656623840332, + "step": 3330 + }, + { + "epoch": 1.7849138651948486, + "grad_norm": 24.841428177881934, + "learning_rate": 4.216750933482646e-07, + "logits/chosen": -0.21872751414775848, + "logits/rejected": -0.017170961946249008, + "logps/chosen": -3.009554386138916, + "logps/rejected": -3.8017420768737793, + "loss": 0.563, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.009554386138916, + "rewards/margins": 0.7921879887580872, + "rewards/rejected": -3.8017420768737793, + "sft_loss": 3.075871467590332, + "step": 3335 + }, + { + "epoch": 1.7875898979762503, + "grad_norm": 36.892800216700245, + "learning_rate": 4.2013730049413986e-07, + "logits/chosen": -0.19581346213817596, + "logits/rejected": -0.0030058815609663725, + "logps/chosen": -2.7792811393737793, + "logps/rejected": -3.8306567668914795, + "loss": 0.4923, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.7792811393737793, + "rewards/margins": 1.0513756275177002, + "rewards/rejected": -3.8306567668914795, + "sft_loss": 2.9651989936828613, + "step": 3340 + }, + { + "epoch": 1.7902659307576518, + "grad_norm": 15.101609080190148, + "learning_rate": 4.1860028246827594e-07, + "logits/chosen": -0.19188782572746277, + "logits/rejected": 0.03688093274831772, + "logps/chosen": -2.7170400619506836, + "logps/rejected": -3.64373779296875, + "loss": 0.5098, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.7170400619506836, + "rewards/margins": 0.9266974329948425, + "rewards/rejected": -3.64373779296875, + "sft_loss": 2.9208431243896484, + "step": 3345 + }, + { + "epoch": 1.7929419635390533, + "grad_norm": 15.293234158359965, + "learning_rate": 4.170640541828285e-07, + "logits/chosen": -0.31982487440109253, + "logits/rejected": -0.16904591023921967, + "logps/chosen": -2.936917543411255, + "logps/rejected": -3.794989824295044, + "loss": 0.5417, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.936917543411255, + "rewards/margins": 0.8580719828605652, + "rewards/rejected": -3.794989824295044, + "sft_loss": 3.077047348022461, + "step": 3350 + }, + { + "epoch": 1.795617996320455, + "grad_norm": 19.198648656415443, + "learning_rate": 4.1552863054229116e-07, + "logits/chosen": -0.10441069304943085, + "logits/rejected": -0.027602875605225563, + "logps/chosen": -3.047877788543701, + "logps/rejected": -3.7978546619415283, + "loss": 0.6106, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -3.047877788543701, + "rewards/margins": 0.7499769330024719, + "rewards/rejected": -3.7978546619415283, + "sft_loss": 3.1034159660339355, + "step": 3355 + }, + { + "epoch": 1.7982940291018565, + "grad_norm": 16.89787950921183, + "learning_rate": 4.139940264433508e-07, + "logits/chosen": -0.2672869861125946, + "logits/rejected": -0.043167419731616974, + "logps/chosen": -2.7406458854675293, + "logps/rejected": -3.6946983337402344, + "loss": 0.5234, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.7406458854675293, + "rewards/margins": 0.9540519714355469, + "rewards/rejected": -3.6946983337402344, + "sft_loss": 2.8566882610321045, + "step": 3360 + }, + { + "epoch": 1.800970061883258, + "grad_norm": 15.023275891915496, + "learning_rate": 4.1246025677474303e-07, + "logits/chosen": -0.29182684421539307, + "logits/rejected": -0.07425285875797272, + "logps/chosen": -2.939763069152832, + "logps/rejected": -3.802227735519409, + "loss": 0.5382, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.939763069152832, + "rewards/margins": 0.8624647259712219, + "rewards/rejected": -3.802227735519409, + "sft_loss": 3.107689380645752, + "step": 3365 + }, + { + "epoch": 1.8036460946646597, + "grad_norm": 15.364491576601871, + "learning_rate": 4.10927336417108e-07, + "logits/chosen": -0.2519490122795105, + "logits/rejected": -0.06419762223958969, + "logps/chosen": -2.964268684387207, + "logps/rejected": -3.6328086853027344, + "loss": 0.6315, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -2.964268684387207, + "rewards/margins": 0.6685395240783691, + "rewards/rejected": -3.6328086853027344, + "sft_loss": 3.074256658554077, + "step": 3370 + }, + { + "epoch": 1.8063221274460612, + "grad_norm": 21.966152974131624, + "learning_rate": 4.093952802428457e-07, + "logits/chosen": -0.09806279838085175, + "logits/rejected": -0.012893694452941418, + "logps/chosen": -3.0203330516815186, + "logps/rejected": -3.806386947631836, + "loss": 0.6036, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.0203330516815186, + "rewards/margins": 0.7860537767410278, + "rewards/rejected": -3.806386947631836, + "sft_loss": 3.112948417663574, + "step": 3375 + }, + { + "epoch": 1.8089981602274627, + "grad_norm": 13.016070631296792, + "learning_rate": 4.0786410311597184e-07, + "logits/chosen": -0.3160150945186615, + "logits/rejected": -0.10090694576501846, + "logps/chosen": -2.8167786598205566, + "logps/rejected": -3.6615512371063232, + "loss": 0.5482, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.8167786598205566, + "rewards/margins": 0.8447723388671875, + "rewards/rejected": -3.6615512371063232, + "sft_loss": 2.863129138946533, + "step": 3380 + }, + { + "epoch": 1.8116741930088645, + "grad_norm": 14.525805888243237, + "learning_rate": 4.063338198919737e-07, + "logits/chosen": -0.2736918330192566, + "logits/rejected": -0.23787431418895721, + "logps/chosen": -2.814265012741089, + "logps/rejected": -3.5256824493408203, + "loss": 0.5887, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.814265012741089, + "rewards/margins": 0.7114171981811523, + "rewards/rejected": -3.5256824493408203, + "sft_loss": 2.904420852661133, + "step": 3385 + }, + { + "epoch": 1.814350225790266, + "grad_norm": 22.23247451095, + "learning_rate": 4.0480444541766575e-07, + "logits/chosen": -0.2675407826900482, + "logits/rejected": -0.09931263327598572, + "logps/chosen": -2.8064563274383545, + "logps/rejected": -3.5337014198303223, + "loss": 0.5936, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.8064563274383545, + "rewards/margins": 0.727245032787323, + "rewards/rejected": -3.5337014198303223, + "sft_loss": 2.860211133956909, + "step": 3390 + }, + { + "epoch": 1.8170262585716674, + "grad_norm": 14.488352793649376, + "learning_rate": 4.0327599453104606e-07, + "logits/chosen": -0.30500540137290955, + "logits/rejected": -0.12484397739171982, + "logps/chosen": -2.6433451175689697, + "logps/rejected": -3.6233038902282715, + "loss": 0.497, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.6433451175689697, + "rewards/margins": 0.9799593091011047, + "rewards/rejected": -3.6233038902282715, + "sft_loss": 2.772120952606201, + "step": 3395 + }, + { + "epoch": 1.8197022913530692, + "grad_norm": 18.939007337615287, + "learning_rate": 4.017484820611514e-07, + "logits/chosen": -0.22828754782676697, + "logits/rejected": -0.0670999065041542, + "logps/chosen": -2.731137275695801, + "logps/rejected": -3.5642154216766357, + "loss": 0.5295, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.731137275695801, + "rewards/margins": 0.8330783843994141, + "rewards/rejected": -3.5642154216766357, + "sft_loss": 2.819455623626709, + "step": 3400 + }, + { + "epoch": 1.8223783241344707, + "grad_norm": 13.642105553707072, + "learning_rate": 4.002219228279148e-07, + "logits/chosen": -0.2647199034690857, + "logits/rejected": -0.09628833085298538, + "logps/chosen": -2.7119967937469482, + "logps/rejected": -3.5323944091796875, + "loss": 0.5146, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.7119967937469482, + "rewards/margins": 0.8203978538513184, + "rewards/rejected": -3.5323944091796875, + "sft_loss": 2.823310613632202, + "step": 3405 + }, + { + "epoch": 1.8250543569158721, + "grad_norm": 15.604402083872163, + "learning_rate": 3.9869633164202045e-07, + "logits/chosen": -0.2539847493171692, + "logits/rejected": 0.020132040604948997, + "logps/chosen": -2.849400043487549, + "logps/rejected": -3.6440062522888184, + "loss": 0.5334, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.849400043487549, + "rewards/margins": 0.7946061491966248, + "rewards/rejected": -3.6440062522888184, + "sft_loss": 2.8739774227142334, + "step": 3410 + }, + { + "epoch": 1.8277303896972739, + "grad_norm": 21.899598807926793, + "learning_rate": 3.9717172330476077e-07, + "logits/chosen": -0.2510986924171448, + "logits/rejected": -0.10071302950382233, + "logps/chosen": -2.827031135559082, + "logps/rejected": -3.734854221343994, + "loss": 0.5492, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.827031135559082, + "rewards/margins": 0.9078229665756226, + "rewards/rejected": -3.734854221343994, + "sft_loss": 2.97774076461792, + "step": 3415 + }, + { + "epoch": 1.8304064224786754, + "grad_norm": 20.063555498521275, + "learning_rate": 3.956481126078927e-07, + "logits/chosen": -0.14432759582996368, + "logits/rejected": -0.011064152233302593, + "logps/chosen": -2.895400047302246, + "logps/rejected": -3.739138126373291, + "loss": 0.5999, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -2.895400047302246, + "rewards/margins": 0.8437372446060181, + "rewards/rejected": -3.739138126373291, + "sft_loss": 3.0360639095306396, + "step": 3420 + }, + { + "epoch": 1.8330824552600768, + "grad_norm": 13.483525350613622, + "learning_rate": 3.941255143334937e-07, + "logits/chosen": -0.2542577385902405, + "logits/rejected": -0.19230565428733826, + "logps/chosen": -2.8025808334350586, + "logps/rejected": -3.6844921112060547, + "loss": 0.5318, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.8025808334350586, + "rewards/margins": 0.8819111585617065, + "rewards/rejected": -3.6844921112060547, + "sft_loss": 2.8600831031799316, + "step": 3425 + }, + { + "epoch": 1.8357584880414786, + "grad_norm": 16.43243776872195, + "learning_rate": 3.9260394325381895e-07, + "logits/chosen": -0.22831299901008606, + "logits/rejected": -0.075655996799469, + "logps/chosen": -2.699906587600708, + "logps/rejected": -3.8140316009521484, + "loss": 0.4896, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.699906587600708, + "rewards/margins": 1.1141245365142822, + "rewards/rejected": -3.8140316009521484, + "sft_loss": 2.791806697845459, + "step": 3430 + }, + { + "epoch": 1.83843452082288, + "grad_norm": 15.428523896946192, + "learning_rate": 3.9108341413115784e-07, + "logits/chosen": -0.24065284430980682, + "logits/rejected": -0.09862431138753891, + "logps/chosen": -2.6614131927490234, + "logps/rejected": -3.627790927886963, + "loss": 0.4677, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.6614131927490234, + "rewards/margins": 0.9663776159286499, + "rewards/rejected": -3.627790927886963, + "sft_loss": 2.8119826316833496, + "step": 3435 + }, + { + "epoch": 1.8411105536042816, + "grad_norm": 19.851445861786452, + "learning_rate": 3.895639417176905e-07, + "logits/chosen": -0.2709447145462036, + "logits/rejected": -0.14585816860198975, + "logps/chosen": -2.7599568367004395, + "logps/rejected": -3.7462761402130127, + "loss": 0.5487, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.7599568367004395, + "rewards/margins": 0.9863188862800598, + "rewards/rejected": -3.7462761402130127, + "sft_loss": 2.869521141052246, + "step": 3440 + }, + { + "epoch": 1.8437865863856833, + "grad_norm": 15.225388707062304, + "learning_rate": 3.8804554075534497e-07, + "logits/chosen": -0.2733847498893738, + "logits/rejected": -0.026109689846634865, + "logps/chosen": -2.805610418319702, + "logps/rejected": -3.7957942485809326, + "loss": 0.5148, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.805610418319702, + "rewards/margins": 0.9901833534240723, + "rewards/rejected": -3.7957942485809326, + "sft_loss": 2.947772264480591, + "step": 3445 + }, + { + "epoch": 1.8464626191670848, + "grad_norm": 15.99874798191993, + "learning_rate": 3.8652822597565403e-07, + "logits/chosen": -0.3326260447502136, + "logits/rejected": -0.12364955246448517, + "logps/chosen": -2.821265459060669, + "logps/rejected": -3.783513307571411, + "loss": 0.5188, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.821265459060669, + "rewards/margins": 0.9622477293014526, + "rewards/rejected": -3.783513307571411, + "sft_loss": 2.948594570159912, + "step": 3450 + }, + { + "epoch": 1.8491386519484863, + "grad_norm": 17.173300996571587, + "learning_rate": 3.850120120996123e-07, + "logits/chosen": -0.23530642688274384, + "logits/rejected": 0.0075013055466115475, + "logps/chosen": -2.9990508556365967, + "logps/rejected": -3.8762524127960205, + "loss": 0.562, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.9990508556365967, + "rewards/margins": 0.8772016763687134, + "rewards/rejected": -3.8762524127960205, + "sft_loss": 3.0761046409606934, + "step": 3455 + }, + { + "epoch": 1.851814684729888, + "grad_norm": 17.42784087365534, + "learning_rate": 3.8349691383753356e-07, + "logits/chosen": -0.13078606128692627, + "logits/rejected": 0.005738553591072559, + "logps/chosen": -2.811361789703369, + "logps/rejected": -3.7540740966796875, + "loss": 0.5298, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.811361789703369, + "rewards/margins": 0.9427124261856079, + "rewards/rejected": -3.7540740966796875, + "sft_loss": 2.885227680206299, + "step": 3460 + }, + { + "epoch": 1.8544907175112895, + "grad_norm": 11.970284039619953, + "learning_rate": 3.819829458889078e-07, + "logits/chosen": -0.23028437793254852, + "logits/rejected": -0.10200443118810654, + "logps/chosen": -2.7681221961975098, + "logps/rejected": -3.6479923725128174, + "loss": 0.534, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -2.7681221961975098, + "rewards/margins": 0.8798701167106628, + "rewards/rejected": -3.6479923725128174, + "sft_loss": 2.8739943504333496, + "step": 3465 + }, + { + "epoch": 1.857166750292691, + "grad_norm": 15.818645996946636, + "learning_rate": 3.804701229422585e-07, + "logits/chosen": -0.2516114115715027, + "logits/rejected": -0.13954241573810577, + "logps/chosen": -2.9505364894866943, + "logps/rejected": -3.8601531982421875, + "loss": 0.5392, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.9505364894866943, + "rewards/margins": 0.9096164703369141, + "rewards/rejected": -3.8601531982421875, + "sft_loss": 3.0391030311584473, + "step": 3470 + }, + { + "epoch": 1.8598427830740927, + "grad_norm": 18.35439781552601, + "learning_rate": 3.789584596750007e-07, + "logits/chosen": -0.22998587787151337, + "logits/rejected": -0.16458949446678162, + "logps/chosen": -2.8295726776123047, + "logps/rejected": -3.6990628242492676, + "loss": 0.5392, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.8295726776123047, + "rewards/margins": 0.8694899678230286, + "rewards/rejected": -3.6990628242492676, + "sft_loss": 2.8997833728790283, + "step": 3475 + }, + { + "epoch": 1.8625188158554944, + "grad_norm": 15.377452961017998, + "learning_rate": 3.77447970753298e-07, + "logits/chosen": -0.11249478161334991, + "logits/rejected": -0.07554732263088226, + "logps/chosen": -2.8798892498016357, + "logps/rejected": -3.7367751598358154, + "loss": 0.543, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.8798892498016357, + "rewards/margins": 0.8568856120109558, + "rewards/rejected": -3.7367751598358154, + "sft_loss": 3.0244550704956055, + "step": 3480 + }, + { + "epoch": 1.8651948486368957, + "grad_norm": 17.636805330875724, + "learning_rate": 3.7593867083192057e-07, + "logits/chosen": -0.19624033570289612, + "logits/rejected": -0.01715945638716221, + "logps/chosen": -2.7213451862335205, + "logps/rejected": -3.5876636505126953, + "loss": 0.555, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.7213451862335205, + "rewards/margins": 0.8663187026977539, + "rewards/rejected": -3.5876636505126953, + "sft_loss": 2.8986878395080566, + "step": 3485 + }, + { + "epoch": 1.8678708814182974, + "grad_norm": 16.273522498705884, + "learning_rate": 3.7443057455410276e-07, + "logits/chosen": -0.19358979165554047, + "logits/rejected": -0.05072442814707756, + "logps/chosen": -2.7149434089660645, + "logps/rejected": -3.714277982711792, + "loss": 0.478, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -2.7149434089660645, + "rewards/margins": 0.999334454536438, + "rewards/rejected": -3.714277982711792, + "sft_loss": 2.915403127670288, + "step": 3490 + }, + { + "epoch": 1.870546914199699, + "grad_norm": 13.653360966622175, + "learning_rate": 3.7292369655140145e-07, + "logits/chosen": -0.29655593633651733, + "logits/rejected": -0.09064863622188568, + "logps/chosen": -2.846242904663086, + "logps/rejected": -3.625659465789795, + "loss": 0.5148, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.846242904663086, + "rewards/margins": 0.779416561126709, + "rewards/rejected": -3.625659465789795, + "sft_loss": 3.0332603454589844, + "step": 3495 + }, + { + "epoch": 1.8732229469811004, + "grad_norm": 15.100840540865368, + "learning_rate": 3.714180514435534e-07, + "logits/chosen": -0.16411438584327698, + "logits/rejected": 0.02352045103907585, + "logps/chosen": -2.780078411102295, + "logps/rejected": -3.7550857067108154, + "loss": 0.5162, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.780078411102295, + "rewards/margins": 0.9750074148178101, + "rewards/rejected": -3.7550857067108154, + "sft_loss": 2.971510648727417, + "step": 3500 + }, + { + "epoch": 1.875898979762502, + "grad_norm": 18.8741615601585, + "learning_rate": 3.6991365383833426e-07, + "logits/chosen": -0.2565234303474426, + "logits/rejected": -0.07370957732200623, + "logps/chosen": -2.813046932220459, + "logps/rejected": -3.7858214378356934, + "loss": 0.4985, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -2.813046932220459, + "rewards/margins": 0.9727746248245239, + "rewards/rejected": -3.7858214378356934, + "sft_loss": 3.0226640701293945, + "step": 3505 + }, + { + "epoch": 1.8785750125439038, + "grad_norm": 19.983117937550535, + "learning_rate": 3.684105183314162e-07, + "logits/chosen": -0.2373221218585968, + "logits/rejected": -0.12379314005374908, + "logps/chosen": -2.7259597778320312, + "logps/rejected": -3.637960433959961, + "loss": 0.5015, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.7259597778320312, + "rewards/margins": 0.9120001792907715, + "rewards/rejected": -3.637960433959961, + "sft_loss": 2.852381467819214, + "step": 3510 + }, + { + "epoch": 1.881251045325305, + "grad_norm": 18.11630276511099, + "learning_rate": 3.669086595062263e-07, + "logits/chosen": -0.23923330008983612, + "logits/rejected": -0.0036049566697329283, + "logps/chosen": -2.9402759075164795, + "logps/rejected": -3.849766492843628, + "loss": 0.5202, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.9402759075164795, + "rewards/margins": 0.9094909429550171, + "rewards/rejected": -3.849766492843628, + "sft_loss": 3.062849521636963, + "step": 3515 + }, + { + "epoch": 1.8839270781067068, + "grad_norm": 14.118071827863625, + "learning_rate": 3.654080919338056e-07, + "logits/chosen": -0.28001868724823, + "logits/rejected": -0.07703977078199387, + "logps/chosen": -2.8357691764831543, + "logps/rejected": -3.7062485218048096, + "loss": 0.5369, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.8357691764831543, + "rewards/margins": 0.8704794049263, + "rewards/rejected": -3.7062485218048096, + "sft_loss": 3.0368707180023193, + "step": 3520 + }, + { + "epoch": 1.8866031108881085, + "grad_norm": 16.525389106043953, + "learning_rate": 3.639088301726673e-07, + "logits/chosen": -0.20582440495491028, + "logits/rejected": 0.0367545410990715, + "logps/chosen": -2.8971991539001465, + "logps/rejected": -3.7518577575683594, + "loss": 0.5588, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.8971991539001465, + "rewards/margins": 0.8546587824821472, + "rewards/rejected": -3.7518577575683594, + "sft_loss": 3.0653936862945557, + "step": 3525 + }, + { + "epoch": 1.88927914366951, + "grad_norm": 20.9316555158969, + "learning_rate": 3.624108887686556e-07, + "logits/chosen": -0.20897629857063293, + "logits/rejected": -0.11267737299203873, + "logps/chosen": -2.93704891204834, + "logps/rejected": -3.8252410888671875, + "loss": 0.5127, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.93704891204834, + "rewards/margins": 0.888192355632782, + "rewards/rejected": -3.8252410888671875, + "sft_loss": 3.1720235347747803, + "step": 3530 + }, + { + "epoch": 1.8919551764509115, + "grad_norm": 12.859754973839012, + "learning_rate": 3.6091428225480433e-07, + "logits/chosen": -0.2824031412601471, + "logits/rejected": -0.11336223781108856, + "logps/chosen": -2.920830488204956, + "logps/rejected": -3.8924171924591064, + "loss": 0.5286, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.920830488204956, + "rewards/margins": 0.9715864062309265, + "rewards/rejected": -3.8924171924591064, + "sft_loss": 3.1959595680236816, + "step": 3535 + }, + { + "epoch": 1.8946312092323132, + "grad_norm": 20.170217014416863, + "learning_rate": 3.5941902515119674e-07, + "logits/chosen": -0.27359622716903687, + "logits/rejected": -0.008305774070322514, + "logps/chosen": -2.9588866233825684, + "logps/rejected": -3.740037202835083, + "loss": 0.5759, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.9588866233825684, + "rewards/margins": 0.7811505198478699, + "rewards/rejected": -3.740037202835083, + "sft_loss": 3.141091823577881, + "step": 3540 + }, + { + "epoch": 1.8973072420137147, + "grad_norm": 18.208678592847644, + "learning_rate": 3.5792513196482373e-07, + "logits/chosen": -0.38286128640174866, + "logits/rejected": -0.04241828992962837, + "logps/chosen": -2.8372414112091064, + "logps/rejected": -3.7894434928894043, + "loss": 0.4823, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -2.8372414112091064, + "rewards/margins": 0.952202320098877, + "rewards/rejected": -3.7894434928894043, + "sft_loss": 2.954357624053955, + "step": 3545 + }, + { + "epoch": 1.8999832747951162, + "grad_norm": 16.611233931970162, + "learning_rate": 3.5643261718944346e-07, + "logits/chosen": -0.13990192115306854, + "logits/rejected": -0.02662300132215023, + "logps/chosen": -2.934990644454956, + "logps/rejected": -3.732720136642456, + "loss": 0.5683, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.934990644454956, + "rewards/margins": 0.7977299094200134, + "rewards/rejected": -3.732720136642456, + "sft_loss": 2.920933246612549, + "step": 3550 + }, + { + "epoch": 1.902659307576518, + "grad_norm": 14.196617770384604, + "learning_rate": 3.5494149530544087e-07, + "logits/chosen": -0.29803937673568726, + "logits/rejected": -0.16058126091957092, + "logps/chosen": -2.825559139251709, + "logps/rejected": -3.727395534515381, + "loss": 0.5747, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.825559139251709, + "rewards/margins": 0.9018365740776062, + "rewards/rejected": -3.727395534515381, + "sft_loss": 2.9392800331115723, + "step": 3555 + }, + { + "epoch": 1.9053353403579194, + "grad_norm": 17.050120292129, + "learning_rate": 3.534517807796871e-07, + "logits/chosen": -0.24503159523010254, + "logits/rejected": -0.10288417339324951, + "logps/chosen": -2.7696847915649414, + "logps/rejected": -3.6267178058624268, + "loss": 0.5248, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.7696847915649414, + "rewards/margins": 0.8570332527160645, + "rewards/rejected": -3.6267178058624268, + "sft_loss": 2.9060094356536865, + "step": 3560 + }, + { + "epoch": 1.908011373139321, + "grad_norm": 15.24198881267226, + "learning_rate": 3.519634880653988e-07, + "logits/chosen": -0.20890846848487854, + "logits/rejected": -0.07936038076877594, + "logps/chosen": -2.9502017498016357, + "logps/rejected": -3.9783225059509277, + "loss": 0.5096, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.9502017498016357, + "rewards/margins": 1.028120756149292, + "rewards/rejected": -3.9783225059509277, + "sft_loss": 3.110295057296753, + "step": 3565 + }, + { + "epoch": 1.9106874059207226, + "grad_norm": 13.753923022395519, + "learning_rate": 3.504766316019987e-07, + "logits/chosen": -0.28474587202072144, + "logits/rejected": -0.05706968158483505, + "logps/chosen": -2.75578236579895, + "logps/rejected": -3.6933975219726562, + "loss": 0.4884, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.75578236579895, + "rewards/margins": 0.9376150965690613, + "rewards/rejected": -3.6933975219726562, + "sft_loss": 2.853646755218506, + "step": 3570 + }, + { + "epoch": 1.913363438702124, + "grad_norm": 12.325171877103088, + "learning_rate": 3.489912258149745e-07, + "logits/chosen": -0.16082385182380676, + "logits/rejected": -0.013875825330615044, + "logps/chosen": -2.8332247734069824, + "logps/rejected": -3.8153090476989746, + "loss": 0.545, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.8332247734069824, + "rewards/margins": 0.9820839166641235, + "rewards/rejected": -3.8153090476989746, + "sft_loss": 2.911475419998169, + "step": 3575 + }, + { + "epoch": 1.9160394714835256, + "grad_norm": 12.90296161207828, + "learning_rate": 3.475072851157397e-07, + "logits/chosen": -0.20278310775756836, + "logits/rejected": -0.11776771396398544, + "logps/chosen": -2.810978889465332, + "logps/rejected": -3.799607038497925, + "loss": 0.4928, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.810978889465332, + "rewards/margins": 0.9886280298233032, + "rewards/rejected": -3.799607038497925, + "sft_loss": 2.984903573989868, + "step": 3580 + }, + { + "epoch": 1.9187155042649273, + "grad_norm": 13.919607678377089, + "learning_rate": 3.460248239014936e-07, + "logits/chosen": -0.1239798441529274, + "logits/rejected": -0.034661222249269485, + "logps/chosen": -2.9402782917022705, + "logps/rejected": -3.8812129497528076, + "loss": 0.5112, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.9402782917022705, + "rewards/margins": 0.9409344792366028, + "rewards/rejected": -3.8812129497528076, + "sft_loss": 3.1353812217712402, + "step": 3585 + }, + { + "epoch": 1.9213915370463288, + "grad_norm": 13.995097148662646, + "learning_rate": 3.4454385655508134e-07, + "logits/chosen": -0.16172340512275696, + "logits/rejected": -0.08319219201803207, + "logps/chosen": -2.9241456985473633, + "logps/rejected": -3.6937434673309326, + "loss": 0.5888, + "rewards/accuracies": 0.65625, + "rewards/chosen": -2.9241456985473633, + "rewards/margins": 0.7695978879928589, + "rewards/rejected": -3.6937434673309326, + "sft_loss": 3.051616668701172, + "step": 3590 + }, + { + "epoch": 1.9240675698277303, + "grad_norm": 11.714401564603223, + "learning_rate": 3.4306439744485447e-07, + "logits/chosen": -0.2790454626083374, + "logits/rejected": -0.04525241255760193, + "logps/chosen": -2.9379630088806152, + "logps/rejected": -3.804042100906372, + "loss": 0.5476, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.9379630088806152, + "rewards/margins": 0.8660792112350464, + "rewards/rejected": -3.804042100906372, + "sft_loss": 2.9716274738311768, + "step": 3595 + }, + { + "epoch": 1.926743602609132, + "grad_norm": 16.019929188124948, + "learning_rate": 3.415864609245322e-07, + "logits/chosen": -0.17339129745960236, + "logits/rejected": 0.01918947696685791, + "logps/chosen": -2.9560437202453613, + "logps/rejected": -3.9043736457824707, + "loss": 0.5625, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -2.9560437202453613, + "rewards/margins": 0.9483293294906616, + "rewards/rejected": -3.9043736457824707, + "sft_loss": 3.161888360977173, + "step": 3600 + }, + { + "epoch": 1.926743602609132, + "eval_logits/chosen": 0.24408887326717377, + "eval_logits/rejected": 0.36701610684394836, + "eval_logps/chosen": -2.859692335128784, + "eval_logps/rejected": -3.7385363578796387, + "eval_loss": 0.5478324294090271, + "eval_rewards/accuracies": 0.7255192995071411, + "eval_rewards/chosen": -2.859692335128784, + "eval_rewards/margins": 0.878844141960144, + "eval_rewards/rejected": -3.7385363578796387, + "eval_runtime": 50.1277, + "eval_samples_per_second": 26.831, + "eval_sft_loss": 3.0563251972198486, + "eval_steps_per_second": 6.723, + "step": 3600 + }, + { + "epoch": 1.9294196353905335, + "grad_norm": 15.270554541376217, + "learning_rate": 3.401100613330605e-07, + "logits/chosen": -0.23438136279582977, + "logits/rejected": -0.19709154963493347, + "logps/chosen": -2.741083860397339, + "logps/rejected": -3.5722594261169434, + "loss": 0.5405, + "rewards/accuracies": 0.71875, + "rewards/chosen": -2.741083860397339, + "rewards/margins": 0.8311758041381836, + "rewards/rejected": -3.5722594261169434, + "sft_loss": 2.9489662647247314, + "step": 3605 + }, + { + "epoch": 1.932095668171935, + "grad_norm": 14.471201812410206, + "learning_rate": 3.3863521299447514e-07, + "logits/chosen": -0.2784094214439392, + "logits/rejected": -0.0939759910106659, + "logps/chosen": -2.7622246742248535, + "logps/rejected": -3.6392664909362793, + "loss": 0.5004, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.7622246742248535, + "rewards/margins": 0.8770421147346497, + "rewards/rejected": -3.6392664909362793, + "sft_loss": 2.948101758956909, + "step": 3610 + }, + { + "epoch": 1.9347717009533367, + "grad_norm": 15.877868402206891, + "learning_rate": 3.371619302177609e-07, + "logits/chosen": -0.15976648032665253, + "logits/rejected": 0.0016909487312659621, + "logps/chosen": -2.9228010177612305, + "logps/rejected": -3.85103178024292, + "loss": 0.5133, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.9228010177612305, + "rewards/margins": 0.9282311201095581, + "rewards/rejected": -3.85103178024292, + "sft_loss": 3.01761794090271, + "step": 3615 + }, + { + "epoch": 1.9374477337347382, + "grad_norm": 21.44056032546373, + "learning_rate": 3.3569022729671393e-07, + "logits/chosen": -0.1950550526380539, + "logits/rejected": -0.07320324331521988, + "logps/chosen": -3.0602307319641113, + "logps/rejected": -3.849700927734375, + "loss": 0.5615, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.0602307319641113, + "rewards/margins": 0.7894704937934875, + "rewards/rejected": -3.849700927734375, + "sft_loss": 3.249194622039795, + "step": 3620 + }, + { + "epoch": 1.9401237665161397, + "grad_norm": 16.019071416415215, + "learning_rate": 3.342201185098024e-07, + "logits/chosen": -0.15915197134017944, + "logits/rejected": -0.1630932092666626, + "logps/chosen": -2.8209826946258545, + "logps/rejected": -3.661130905151367, + "loss": 0.5292, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.8209826946258545, + "rewards/margins": 0.8401481509208679, + "rewards/rejected": -3.661130905151367, + "sft_loss": 2.9431231021881104, + "step": 3625 + }, + { + "epoch": 1.9427997992975414, + "grad_norm": 15.905398766305188, + "learning_rate": 3.3275161812002807e-07, + "logits/chosen": -0.21201694011688232, + "logits/rejected": -0.15958142280578613, + "logps/chosen": -2.9415431022644043, + "logps/rejected": -3.8751723766326904, + "loss": 0.5645, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.9415431022644043, + "rewards/margins": 0.9336287379264832, + "rewards/rejected": -3.8751723766326904, + "sft_loss": 3.1352477073669434, + "step": 3630 + }, + { + "epoch": 1.945475832078943, + "grad_norm": 13.58100398003522, + "learning_rate": 3.312847403747883e-07, + "logits/chosen": -0.272103875875473, + "logits/rejected": -0.13591468334197998, + "logps/chosen": -2.8277993202209473, + "logps/rejected": -3.806042432785034, + "loss": 0.4966, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.8277993202209473, + "rewards/margins": 0.9782431721687317, + "rewards/rejected": -3.806042432785034, + "sft_loss": 2.9858555793762207, + "step": 3635 + }, + { + "epoch": 1.9481518648603444, + "grad_norm": 15.315418911990719, + "learning_rate": 3.2981949950573733e-07, + "logits/chosen": -0.21761062741279602, + "logits/rejected": -0.12431806325912476, + "logps/chosen": -2.9406609535217285, + "logps/rejected": -3.739722490310669, + "loss": 0.5394, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.9406609535217285, + "rewards/margins": 0.7990615367889404, + "rewards/rejected": -3.739722490310669, + "sft_loss": 3.1177260875701904, + "step": 3640 + }, + { + "epoch": 1.9508278976417461, + "grad_norm": 14.116155538156312, + "learning_rate": 3.283559097286486e-07, + "logits/chosen": -0.24495331943035126, + "logits/rejected": -0.08875279128551483, + "logps/chosen": -2.9264073371887207, + "logps/rejected": -3.5987040996551514, + "loss": 0.5746, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.9264073371887207, + "rewards/margins": 0.6722965240478516, + "rewards/rejected": -3.5987040996551514, + "sft_loss": 3.0629003047943115, + "step": 3645 + }, + { + "epoch": 1.9535039304231478, + "grad_norm": 17.25268549247319, + "learning_rate": 3.268939852432765e-07, + "logits/chosen": -0.29995864629745483, + "logits/rejected": -0.17995992302894592, + "logps/chosen": -3.043863296508789, + "logps/rejected": -3.762787342071533, + "loss": 0.5848, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -3.043863296508789, + "rewards/margins": 0.7189238667488098, + "rewards/rejected": -3.762787342071533, + "sft_loss": 3.2166271209716797, + "step": 3650 + }, + { + "epoch": 1.9561799632045491, + "grad_norm": 21.013060863581366, + "learning_rate": 3.254337402332187e-07, + "logits/chosen": -0.21988160908222198, + "logits/rejected": -0.0836673155426979, + "logps/chosen": -2.945904493331909, + "logps/rejected": -3.8223743438720703, + "loss": 0.5408, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.945904493331909, + "rewards/margins": 0.876469612121582, + "rewards/rejected": -3.8223743438720703, + "sft_loss": 3.0382790565490723, + "step": 3655 + }, + { + "epoch": 1.9588559959859508, + "grad_norm": 13.024525263591862, + "learning_rate": 3.239751888657788e-07, + "logits/chosen": -0.2342025488615036, + "logits/rejected": -0.06337428838014603, + "logps/chosen": -2.926807165145874, + "logps/rejected": -3.78998064994812, + "loss": 0.5428, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.926807165145874, + "rewards/margins": 0.8631734848022461, + "rewards/rejected": -3.78998064994812, + "sft_loss": 3.1329076290130615, + "step": 3660 + }, + { + "epoch": 1.9615320287673526, + "grad_norm": 14.471523363662126, + "learning_rate": 3.2251834529182856e-07, + "logits/chosen": -0.2262742817401886, + "logits/rejected": -0.10412702709436417, + "logps/chosen": -2.7210283279418945, + "logps/rejected": -3.677239179611206, + "loss": 0.5285, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.7210283279418945, + "rewards/margins": 0.9562112092971802, + "rewards/rejected": -3.677239179611206, + "sft_loss": 2.808276414871216, + "step": 3665 + }, + { + "epoch": 1.9642080615487538, + "grad_norm": 13.1135910496621, + "learning_rate": 3.2106322364567075e-07, + "logits/chosen": -0.22553035616874695, + "logits/rejected": -0.0694446787238121, + "logps/chosen": -2.8147988319396973, + "logps/rejected": -3.906506299972534, + "loss": 0.4704, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -2.8147988319396973, + "rewards/margins": 1.091707468032837, + "rewards/rejected": -3.906506299972534, + "sft_loss": 3.0503079891204834, + "step": 3670 + }, + { + "epoch": 1.9668840943301555, + "grad_norm": 15.40942759554274, + "learning_rate": 3.1960983804490183e-07, + "logits/chosen": -0.2783694863319397, + "logits/rejected": -0.10769981145858765, + "logps/chosen": -3.0342190265655518, + "logps/rejected": -3.9841713905334473, + "loss": 0.5712, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -3.0342190265655518, + "rewards/margins": 0.9499521255493164, + "rewards/rejected": -3.9841713905334473, + "sft_loss": 3.1858086585998535, + "step": 3675 + }, + { + "epoch": 1.9695601271115573, + "grad_norm": 14.517911574149721, + "learning_rate": 3.1815820259027537e-07, + "logits/chosen": -0.26202255487442017, + "logits/rejected": -0.11341051012277603, + "logps/chosen": -2.6334633827209473, + "logps/rejected": -3.5371620655059814, + "loss": 0.4942, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.6334633827209473, + "rewards/margins": 0.9036985635757446, + "rewards/rejected": -3.5371620655059814, + "sft_loss": 2.8220176696777344, + "step": 3680 + }, + { + "epoch": 1.9722361598929585, + "grad_norm": 18.456034604568856, + "learning_rate": 3.16708331365565e-07, + "logits/chosen": -0.27187708020210266, + "logits/rejected": -0.1584286093711853, + "logps/chosen": -2.8514575958251953, + "logps/rejected": -3.7883028984069824, + "loss": 0.5393, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.8514575958251953, + "rewards/margins": 0.936845600605011, + "rewards/rejected": -3.7883028984069824, + "sft_loss": 3.0907986164093018, + "step": 3685 + }, + { + "epoch": 1.9749121926743602, + "grad_norm": 12.80151079203658, + "learning_rate": 3.152602384374275e-07, + "logits/chosen": -0.2671336829662323, + "logits/rejected": -0.05357428267598152, + "logps/chosen": -2.947993040084839, + "logps/rejected": -3.795607089996338, + "loss": 0.5436, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.947993040084839, + "rewards/margins": 0.8476136922836304, + "rewards/rejected": -3.795607089996338, + "sft_loss": 3.047563076019287, + "step": 3690 + }, + { + "epoch": 1.977588225455762, + "grad_norm": 16.203253604205564, + "learning_rate": 3.1381393785526697e-07, + "logits/chosen": -0.27621060609817505, + "logits/rejected": -0.1704329550266266, + "logps/chosen": -2.8942766189575195, + "logps/rejected": -3.7458534240722656, + "loss": 0.552, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.8942766189575195, + "rewards/margins": 0.8515765070915222, + "rewards/rejected": -3.7458534240722656, + "sft_loss": 3.07773494720459, + "step": 3695 + }, + { + "epoch": 1.9802642582371635, + "grad_norm": 14.229932105580046, + "learning_rate": 3.123694436510979e-07, + "logits/chosen": -0.19697576761245728, + "logits/rejected": -0.03812349587678909, + "logps/chosen": -2.7251546382904053, + "logps/rejected": -3.6232516765594482, + "loss": 0.5101, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -2.7251546382904053, + "rewards/margins": 0.8980968594551086, + "rewards/rejected": -3.6232516765594482, + "sft_loss": 2.8783936500549316, + "step": 3700 + }, + { + "epoch": 1.982940291018565, + "grad_norm": 17.557722448319616, + "learning_rate": 3.1092676983940946e-07, + "logits/chosen": -0.2563186287879944, + "logits/rejected": -0.16103769838809967, + "logps/chosen": -2.7463276386260986, + "logps/rejected": -3.7818381786346436, + "loss": 0.4905, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.7463276386260986, + "rewards/margins": 1.035510540008545, + "rewards/rejected": -3.7818381786346436, + "sft_loss": 2.89477801322937, + "step": 3705 + }, + { + "epoch": 1.9856163237999667, + "grad_norm": 16.39475902018491, + "learning_rate": 3.094859304170293e-07, + "logits/chosen": -0.06748291850090027, + "logits/rejected": -0.017182841897010803, + "logps/chosen": -2.8282206058502197, + "logps/rejected": -3.6515190601348877, + "loss": 0.5719, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -2.8282206058502197, + "rewards/margins": 0.823298454284668, + "rewards/rejected": -3.6515190601348877, + "sft_loss": 3.028571367263794, + "step": 3710 + }, + { + "epoch": 1.9882923565813682, + "grad_norm": 15.992315690449052, + "learning_rate": 3.0804693936298795e-07, + "logits/chosen": -0.17725564539432526, + "logits/rejected": -0.10359780490398407, + "logps/chosen": -2.8508968353271484, + "logps/rejected": -3.920119047164917, + "loss": 0.4938, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.8508968353271484, + "rewards/margins": 1.069222331047058, + "rewards/rejected": -3.920119047164917, + "sft_loss": 3.0350348949432373, + "step": 3715 + }, + { + "epoch": 1.9909683893627697, + "grad_norm": 14.889695737319636, + "learning_rate": 3.066098106383826e-07, + "logits/chosen": -0.24103884398937225, + "logits/rejected": -0.11680476367473602, + "logps/chosen": -2.8061740398406982, + "logps/rejected": -3.612884521484375, + "loss": 0.5453, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.8061740398406982, + "rewards/margins": 0.806710422039032, + "rewards/rejected": -3.612884521484375, + "sft_loss": 2.8877646923065186, + "step": 3720 + }, + { + "epoch": 1.9936444221441714, + "grad_norm": 14.487555363306377, + "learning_rate": 3.0517455818624263e-07, + "logits/chosen": -0.29338186979293823, + "logits/rejected": -0.17560279369354248, + "logps/chosen": -2.8024487495422363, + "logps/rejected": -3.7356905937194824, + "loss": 0.5019, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.8024487495422363, + "rewards/margins": 0.9332420229911804, + "rewards/rejected": -3.7356905937194824, + "sft_loss": 3.047268867492676, + "step": 3725 + }, + { + "epoch": 1.9963204549255729, + "grad_norm": 13.336958480256456, + "learning_rate": 3.037411959313936e-07, + "logits/chosen": -0.2274896800518036, + "logits/rejected": -0.06707743555307388, + "logps/chosen": -2.781489849090576, + "logps/rejected": -3.660910129547119, + "loss": 0.4864, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -2.781489849090576, + "rewards/margins": 0.8794196844100952, + "rewards/rejected": -3.660910129547119, + "sft_loss": 2.9758944511413574, + "step": 3730 + }, + { + "epoch": 1.9989964877069744, + "grad_norm": 16.98708079592768, + "learning_rate": 3.023097377803224e-07, + "logits/chosen": -0.14046138525009155, + "logits/rejected": -0.0367421992123127, + "logps/chosen": -2.9978785514831543, + "logps/rejected": -3.839390993118286, + "loss": 0.5814, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -2.9978785514831543, + "rewards/margins": 0.8415123820304871, + "rewards/rejected": -3.839390993118286, + "sft_loss": 3.11506724357605, + "step": 3735 + }, + { + "epoch": 2.001672520488376, + "grad_norm": 14.889260074472366, + "learning_rate": 3.008801976210423e-07, + "logits/chosen": -0.1395779699087143, + "logits/rejected": -0.06194404885172844, + "logps/chosen": -3.1212987899780273, + "logps/rejected": -3.968535900115967, + "loss": 0.5346, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.1212987899780273, + "rewards/margins": 0.8472372889518738, + "rewards/rejected": -3.968535900115967, + "sft_loss": 3.2119712829589844, + "step": 3740 + }, + { + "epoch": 2.0043485532697773, + "grad_norm": 14.158552312571112, + "learning_rate": 2.994525893229581e-07, + "logits/chosen": -0.16429729759693146, + "logits/rejected": -0.05637521669268608, + "logps/chosen": -2.970836877822876, + "logps/rejected": -4.122104167938232, + "loss": 0.4368, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -2.970836877822876, + "rewards/margins": 1.1512675285339355, + "rewards/rejected": -4.122104167938232, + "sft_loss": 3.106628894805908, + "step": 3745 + }, + { + "epoch": 2.007024586051179, + "grad_norm": 13.97493978051257, + "learning_rate": 2.98026926736732e-07, + "logits/chosen": -0.24882233142852783, + "logits/rejected": -0.1182834729552269, + "logps/chosen": -2.7795872688293457, + "logps/rejected": -3.8998725414276123, + "loss": 0.4589, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -2.7795872688293457, + "rewards/margins": 1.1202853918075562, + "rewards/rejected": -3.8998725414276123, + "sft_loss": 3.035322427749634, + "step": 3750 + }, + { + "epoch": 2.0097006188325808, + "grad_norm": 14.726312849685929, + "learning_rate": 2.9660322369414846e-07, + "logits/chosen": -0.24929046630859375, + "logits/rejected": -0.0745077133178711, + "logps/chosen": -2.9807448387145996, + "logps/rejected": -4.177748203277588, + "loss": 0.448, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -2.9807448387145996, + "rewards/margins": 1.197003722190857, + "rewards/rejected": -4.177748203277588, + "sft_loss": 3.2656562328338623, + "step": 3755 + }, + { + "epoch": 2.0123766516139825, + "grad_norm": 12.955406923867912, + "learning_rate": 2.9518149400798063e-07, + "logits/chosen": -0.27042073011398315, + "logits/rejected": -0.1892492026090622, + "logps/chosen": -3.0288338661193848, + "logps/rejected": -4.288145065307617, + "loss": 0.4451, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.0288338661193848, + "rewards/margins": 1.259311318397522, + "rewards/rejected": -4.288145065307617, + "sft_loss": 3.255460739135742, + "step": 3760 + }, + { + "epoch": 2.0150526843953838, + "grad_norm": 21.026865395750054, + "learning_rate": 2.9376175147185633e-07, + "logits/chosen": -0.22757235169410706, + "logits/rejected": 0.01834225468337536, + "logps/chosen": -3.2413315773010254, + "logps/rejected": -4.376957416534424, + "loss": 0.5095, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.2413315773010254, + "rewards/margins": 1.1356260776519775, + "rewards/rejected": -4.376957416534424, + "sft_loss": 3.3733558654785156, + "step": 3765 + }, + { + "epoch": 2.0177287171767855, + "grad_norm": 23.06269740719491, + "learning_rate": 2.9234400986012376e-07, + "logits/chosen": -0.3123210072517395, + "logits/rejected": -0.10822536796331406, + "logps/chosen": -3.0676515102386475, + "logps/rejected": -4.425958156585693, + "loss": 0.4553, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -3.0676515102386475, + "rewards/margins": 1.3583061695098877, + "rewards/rejected": -4.425958156585693, + "sft_loss": 3.320253849029541, + "step": 3770 + }, + { + "epoch": 2.020404749958187, + "grad_norm": 19.322245740661582, + "learning_rate": 2.9092828292771817e-07, + "logits/chosen": -0.18818703293800354, + "logits/rejected": -0.12292011082172394, + "logps/chosen": -3.184173583984375, + "logps/rejected": -4.315966606140137, + "loss": 0.476, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.184173583984375, + "rewards/margins": 1.1317930221557617, + "rewards/rejected": -4.315966606140137, + "sft_loss": 3.3012537956237793, + "step": 3775 + }, + { + "epoch": 2.0230807827395885, + "grad_norm": 13.414764005285019, + "learning_rate": 2.8951458441002875e-07, + "logits/chosen": -0.22242359817028046, + "logits/rejected": -0.1718529760837555, + "logps/chosen": -3.073848009109497, + "logps/rejected": -4.259646415710449, + "loss": 0.4569, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.073848009109497, + "rewards/margins": 1.1857987642288208, + "rewards/rejected": -4.259646415710449, + "sft_loss": 3.235039472579956, + "step": 3780 + }, + { + "epoch": 2.02575681552099, + "grad_norm": 12.06602753174118, + "learning_rate": 2.881029280227643e-07, + "logits/chosen": -0.2631004750728607, + "logits/rejected": -0.06374208629131317, + "logps/chosen": -3.1589930057525635, + "logps/rejected": -4.320956230163574, + "loss": 0.4747, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -3.1589930057525635, + "rewards/margins": 1.1619632244110107, + "rewards/rejected": -4.320956230163574, + "sft_loss": 3.288616895675659, + "step": 3785 + }, + { + "epoch": 2.028432848302392, + "grad_norm": 11.64879865176554, + "learning_rate": 2.8669332746182177e-07, + "logits/chosen": -0.3327762186527252, + "logits/rejected": -0.13205161690711975, + "logps/chosen": -3.0247673988342285, + "logps/rejected": -4.253329277038574, + "loss": 0.4586, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.0247673988342285, + "rewards/margins": 1.2285616397857666, + "rewards/rejected": -4.253329277038574, + "sft_loss": 3.222275972366333, + "step": 3790 + }, + { + "epoch": 2.031108881083793, + "grad_norm": 15.875083033580784, + "learning_rate": 2.8528579640315156e-07, + "logits/chosen": -0.20314481854438782, + "logits/rejected": -0.16578742861747742, + "logps/chosen": -2.9212849140167236, + "logps/rejected": -3.900330066680908, + "loss": 0.5052, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.9212849140167236, + "rewards/margins": 0.979045033454895, + "rewards/rejected": -3.900330066680908, + "sft_loss": 3.1145448684692383, + "step": 3795 + }, + { + "epoch": 2.033784913865195, + "grad_norm": 23.02598168268979, + "learning_rate": 2.8388034850262646e-07, + "logits/chosen": -0.23519723117351532, + "logits/rejected": -0.06606370955705643, + "logps/chosen": -3.071678876876831, + "logps/rejected": -4.214540958404541, + "loss": 0.4638, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -3.071678876876831, + "rewards/margins": 1.142861247062683, + "rewards/rejected": -4.214540958404541, + "sft_loss": 3.3099563121795654, + "step": 3800 + }, + { + "epoch": 2.0364609466465966, + "grad_norm": 19.864249915483093, + "learning_rate": 2.824769973959079e-07, + "logits/chosen": -0.25034523010253906, + "logits/rejected": -0.08123587816953659, + "logps/chosen": -2.9977831840515137, + "logps/rejected": -4.137685298919678, + "loss": 0.4488, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.9977831840515137, + "rewards/margins": 1.1399024724960327, + "rewards/rejected": -4.137685298919678, + "sft_loss": 3.1640961170196533, + "step": 3805 + }, + { + "epoch": 2.039136979427998, + "grad_norm": 15.379477022203476, + "learning_rate": 2.81075756698315e-07, + "logits/chosen": -0.13126808404922485, + "logits/rejected": -0.017083149403333664, + "logps/chosen": -2.9781837463378906, + "logps/rejected": -4.2689385414123535, + "loss": 0.3996, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -2.9781837463378906, + "rewards/margins": 1.290755033493042, + "rewards/rejected": -4.2689385414123535, + "sft_loss": 3.0259854793548584, + "step": 3810 + }, + { + "epoch": 2.0418130122093996, + "grad_norm": 14.558688116421909, + "learning_rate": 2.7967664000469035e-07, + "logits/chosen": -0.31151649355888367, + "logits/rejected": -0.16419212520122528, + "logps/chosen": -3.105769634246826, + "logps/rejected": -4.2047200202941895, + "loss": 0.4558, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.105769634246826, + "rewards/margins": 1.098949909210205, + "rewards/rejected": -4.2047200202941895, + "sft_loss": 3.147165536880493, + "step": 3815 + }, + { + "epoch": 2.0444890449908013, + "grad_norm": 16.17521936665668, + "learning_rate": 2.7827966088927095e-07, + "logits/chosen": -0.33099812269210815, + "logits/rejected": -0.04387823864817619, + "logps/chosen": -3.1706812381744385, + "logps/rejected": -4.422829627990723, + "loss": 0.4525, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.1706812381744385, + "rewards/margins": 1.2521488666534424, + "rewards/rejected": -4.422829627990723, + "sft_loss": 3.3813254833221436, + "step": 3820 + }, + { + "epoch": 2.0471650777722026, + "grad_norm": 16.53156057156711, + "learning_rate": 2.768848329055538e-07, + "logits/chosen": -0.20927992463111877, + "logits/rejected": -0.11633219569921494, + "logps/chosen": -3.0672831535339355, + "logps/rejected": -4.292285919189453, + "loss": 0.4382, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.0672831535339355, + "rewards/margins": 1.225002646446228, + "rewards/rejected": -4.292285919189453, + "sft_loss": 3.2913658618927, + "step": 3825 + }, + { + "epoch": 2.0498411105536043, + "grad_norm": 19.536733489255592, + "learning_rate": 2.7549216958616657e-07, + "logits/chosen": -0.330092191696167, + "logits/rejected": -0.1299157738685608, + "logps/chosen": -3.3392879962921143, + "logps/rejected": -4.565664291381836, + "loss": 0.4668, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -3.3392879962921143, + "rewards/margins": 1.2263761758804321, + "rewards/rejected": -4.565664291381836, + "sft_loss": 3.4869301319122314, + "step": 3830 + }, + { + "epoch": 2.052517143335006, + "grad_norm": 14.002002293057318, + "learning_rate": 2.741016844427344e-07, + "logits/chosen": -0.22635014355182648, + "logits/rejected": -0.006752826273441315, + "logps/chosen": -3.167531728744507, + "logps/rejected": -4.436515808105469, + "loss": 0.4436, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.167531728744507, + "rewards/margins": 1.2689836025238037, + "rewards/rejected": -4.436515808105469, + "sft_loss": 3.422799587249756, + "step": 3835 + }, + { + "epoch": 2.0551931761164073, + "grad_norm": 14.720630160755997, + "learning_rate": 2.7271339096575073e-07, + "logits/chosen": -0.17786608636379242, + "logits/rejected": -0.00022650808386970311, + "logps/chosen": -3.0252976417541504, + "logps/rejected": -4.256524562835693, + "loss": 0.4593, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.0252976417541504, + "rewards/margins": 1.2312266826629639, + "rewards/rejected": -4.256524562835693, + "sft_loss": 3.2365882396698, + "step": 3840 + }, + { + "epoch": 2.057869208897809, + "grad_norm": 14.236715915279673, + "learning_rate": 2.713273026244446e-07, + "logits/chosen": -0.3513855040073395, + "logits/rejected": -0.06968530267477036, + "logps/chosen": -3.218956470489502, + "logps/rejected": -4.501593112945557, + "loss": 0.4086, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.218956470489502, + "rewards/margins": 1.2826364040374756, + "rewards/rejected": -4.501593112945557, + "sft_loss": 3.32515287399292, + "step": 3845 + }, + { + "epoch": 2.0605452416792107, + "grad_norm": 17.603244982199804, + "learning_rate": 2.6994343286665156e-07, + "logits/chosen": -0.2870510220527649, + "logits/rejected": -0.05223512649536133, + "logps/chosen": -3.2820980548858643, + "logps/rejected": -4.297463893890381, + "loss": 0.507, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.2820980548858643, + "rewards/margins": 1.0153658390045166, + "rewards/rejected": -4.297463893890381, + "sft_loss": 3.4639363288879395, + "step": 3850 + }, + { + "epoch": 2.063221274460612, + "grad_norm": 18.77331919350168, + "learning_rate": 2.6856179511868156e-07, + "logits/chosen": -0.21658697724342346, + "logits/rejected": 0.019917303696274757, + "logps/chosen": -3.235149383544922, + "logps/rejected": -4.659626007080078, + "loss": 0.4438, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -3.235149383544922, + "rewards/margins": 1.4244766235351562, + "rewards/rejected": -4.659626007080078, + "sft_loss": 3.3648905754089355, + "step": 3855 + }, + { + "epoch": 2.0658973072420137, + "grad_norm": 15.47100105468003, + "learning_rate": 2.6718240278519056e-07, + "logits/chosen": -0.18325772881507874, + "logits/rejected": -0.0073938206769526005, + "logps/chosen": -3.246943950653076, + "logps/rejected": -4.498908042907715, + "loss": 0.4531, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.246943950653076, + "rewards/margins": 1.2519642114639282, + "rewards/rejected": -4.498908042907715, + "sft_loss": 3.366358995437622, + "step": 3860 + }, + { + "epoch": 2.0685733400234154, + "grad_norm": 19.67939759305146, + "learning_rate": 2.6580526924904866e-07, + "logits/chosen": -0.3614625930786133, + "logits/rejected": -0.16604101657867432, + "logps/chosen": -3.1864423751831055, + "logps/rejected": -4.37761926651001, + "loss": 0.4572, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.1864423751831055, + "rewards/margins": 1.1911770105361938, + "rewards/rejected": -4.37761926651001, + "sft_loss": 3.3482697010040283, + "step": 3865 + }, + { + "epoch": 2.0712493728048167, + "grad_norm": 16.813465827880535, + "learning_rate": 2.6443040787121186e-07, + "logits/chosen": -0.34609368443489075, + "logits/rejected": -0.20742973685264587, + "logps/chosen": -3.072408676147461, + "logps/rejected": -4.248513221740723, + "loss": 0.4607, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.072408676147461, + "rewards/margins": 1.1761040687561035, + "rewards/rejected": -4.248513221740723, + "sft_loss": 3.3169169425964355, + "step": 3870 + }, + { + "epoch": 2.0739254055862184, + "grad_norm": 15.706136306241692, + "learning_rate": 2.6305783199059084e-07, + "logits/chosen": -0.2473127543926239, + "logits/rejected": -0.11334657669067383, + "logps/chosen": -3.205875873565674, + "logps/rejected": -4.373950004577637, + "loss": 0.4714, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.205875873565674, + "rewards/margins": 1.168074131011963, + "rewards/rejected": -4.373950004577637, + "sft_loss": 3.4339394569396973, + "step": 3875 + }, + { + "epoch": 2.07660143836762, + "grad_norm": 18.912546324244307, + "learning_rate": 2.6168755492392324e-07, + "logits/chosen": -0.3082582652568817, + "logits/rejected": -0.09416376054286957, + "logps/chosen": -2.921729564666748, + "logps/rejected": -4.220498085021973, + "loss": 0.4275, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.921729564666748, + "rewards/margins": 1.2987688779830933, + "rewards/rejected": -4.220498085021973, + "sft_loss": 3.0420355796813965, + "step": 3880 + }, + { + "epoch": 2.0792774711490214, + "grad_norm": 16.556211849700116, + "learning_rate": 2.6031958996564274e-07, + "logits/chosen": -0.29506513476371765, + "logits/rejected": -0.14849936962127686, + "logps/chosen": -2.925961971282959, + "logps/rejected": -4.301008224487305, + "loss": 0.4222, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.925961971282959, + "rewards/margins": 1.3750462532043457, + "rewards/rejected": -4.301008224487305, + "sft_loss": 3.147723913192749, + "step": 3885 + }, + { + "epoch": 2.081953503930423, + "grad_norm": 17.664433888001273, + "learning_rate": 2.589539503877518e-07, + "logits/chosen": -0.21779341995716095, + "logits/rejected": -0.10009002685546875, + "logps/chosen": -3.122642755508423, + "logps/rejected": -4.227787971496582, + "loss": 0.4997, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -3.122642755508423, + "rewards/margins": 1.1051450967788696, + "rewards/rejected": -4.227787971496582, + "sft_loss": 3.3117828369140625, + "step": 3890 + }, + { + "epoch": 2.084629536711825, + "grad_norm": 13.11518902003022, + "learning_rate": 2.5759064943969125e-07, + "logits/chosen": -0.29696202278137207, + "logits/rejected": -0.024793457239866257, + "logps/chosen": -3.1317408084869385, + "logps/rejected": -4.314787864685059, + "loss": 0.4773, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -3.1317408084869385, + "rewards/margins": 1.1830475330352783, + "rewards/rejected": -4.314787864685059, + "sft_loss": 3.2217929363250732, + "step": 3895 + }, + { + "epoch": 2.087305569493226, + "grad_norm": 14.38161932062421, + "learning_rate": 2.562297003482131e-07, + "logits/chosen": -0.19218644499778748, + "logits/rejected": -0.13539327681064606, + "logps/chosen": -3.163686752319336, + "logps/rejected": -4.396138668060303, + "loss": 0.4345, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -3.163686752319336, + "rewards/margins": 1.2324519157409668, + "rewards/rejected": -4.396138668060303, + "sft_loss": 3.3125, + "step": 3900 + }, + { + "epoch": 2.089981602274628, + "grad_norm": 17.436176806374725, + "learning_rate": 2.548711163172512e-07, + "logits/chosen": -0.21015885472297668, + "logits/rejected": -0.09632865339517593, + "logps/chosen": -3.2756333351135254, + "logps/rejected": -4.456031799316406, + "loss": 0.4933, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.2756333351135254, + "rewards/margins": 1.180398941040039, + "rewards/rejected": -4.456031799316406, + "sft_loss": 3.378589630126953, + "step": 3905 + }, + { + "epoch": 2.0926576350560295, + "grad_norm": 16.21246166612964, + "learning_rate": 2.53514910527794e-07, + "logits/chosen": -0.2180435210466385, + "logits/rejected": -0.05177057906985283, + "logps/chosen": -3.026956081390381, + "logps/rejected": -4.208635330200195, + "loss": 0.445, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.026956081390381, + "rewards/margins": 1.181679129600525, + "rewards/rejected": -4.208635330200195, + "sft_loss": 3.2399840354919434, + "step": 3910 + }, + { + "epoch": 2.095333667837431, + "grad_norm": 21.6260859408156, + "learning_rate": 2.5216109613775573e-07, + "logits/chosen": -0.2627881169319153, + "logits/rejected": -0.0699787437915802, + "logps/chosen": -3.3080992698669434, + "logps/rejected": -4.408918380737305, + "loss": 0.5084, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.3080992698669434, + "rewards/margins": 1.1008193492889404, + "rewards/rejected": -4.408918380737305, + "sft_loss": 3.465599536895752, + "step": 3915 + }, + { + "epoch": 2.0980097006188325, + "grad_norm": 15.82214573668947, + "learning_rate": 2.5080968628184993e-07, + "logits/chosen": -0.24287445843219757, + "logits/rejected": -0.05494864657521248, + "logps/chosen": -3.1612963676452637, + "logps/rejected": -4.5922040939331055, + "loss": 0.4206, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.1612963676452637, + "rewards/margins": 1.4309074878692627, + "rewards/rejected": -4.5922040939331055, + "sft_loss": 3.247044324874878, + "step": 3920 + }, + { + "epoch": 2.1006857334002342, + "grad_norm": 14.561044409130913, + "learning_rate": 2.494606940714605e-07, + "logits/chosen": -0.23816195130348206, + "logits/rejected": -0.12281879037618637, + "logps/chosen": -3.0465781688690186, + "logps/rejected": -4.356485843658447, + "loss": 0.4415, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.0465781688690186, + "rewards/margins": 1.30990731716156, + "rewards/rejected": -4.356485843658447, + "sft_loss": 3.24699068069458, + "step": 3925 + }, + { + "epoch": 2.103361766181636, + "grad_norm": 13.12575769056934, + "learning_rate": 2.4811413259451625e-07, + "logits/chosen": -0.3366604447364807, + "logits/rejected": -0.11346001923084259, + "logps/chosen": -3.1869969367980957, + "logps/rejected": -4.55192756652832, + "loss": 0.4457, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -3.1869969367980957, + "rewards/margins": 1.3649308681488037, + "rewards/rejected": -4.55192756652832, + "sft_loss": 3.3118603229522705, + "step": 3930 + }, + { + "epoch": 2.106037798963037, + "grad_norm": 14.285361906378304, + "learning_rate": 2.46770014915362e-07, + "logits/chosen": -0.25430670380592346, + "logits/rejected": -0.12929832935333252, + "logps/chosen": -3.152146577835083, + "logps/rejected": -4.497963905334473, + "loss": 0.4376, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -3.152146577835083, + "rewards/margins": 1.3458168506622314, + "rewards/rejected": -4.497963905334473, + "sft_loss": 3.266214370727539, + "step": 3935 + }, + { + "epoch": 2.108713831744439, + "grad_norm": 25.152286266240516, + "learning_rate": 2.45428354074634e-07, + "logits/chosen": -0.2446119487285614, + "logits/rejected": -0.1413499414920807, + "logps/chosen": -3.1099953651428223, + "logps/rejected": -4.482461929321289, + "loss": 0.4381, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -3.1099953651428223, + "rewards/margins": 1.3724663257598877, + "rewards/rejected": -4.482461929321289, + "sft_loss": 3.1997177600860596, + "step": 3940 + }, + { + "epoch": 2.1113898645258407, + "grad_norm": 19.556528330778583, + "learning_rate": 2.4408916308913105e-07, + "logits/chosen": -0.28477686643600464, + "logits/rejected": -0.06802688539028168, + "logps/chosen": -3.315284013748169, + "logps/rejected": -4.377490043640137, + "loss": 0.5111, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.315284013748169, + "rewards/margins": 1.0622055530548096, + "rewards/rejected": -4.377490043640137, + "sft_loss": 3.5316920280456543, + "step": 3945 + }, + { + "epoch": 2.114065897307242, + "grad_norm": 26.935194574649227, + "learning_rate": 2.4275245495169025e-07, + "logits/chosen": -0.18549111485481262, + "logits/rejected": 0.003963217604905367, + "logps/chosen": -3.165619373321533, + "logps/rejected": -4.436346054077148, + "loss": 0.4532, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -3.165619373321533, + "rewards/margins": 1.2707264423370361, + "rewards/rejected": -4.436346054077148, + "sft_loss": 3.3000197410583496, + "step": 3950 + }, + { + "epoch": 2.1167419300886436, + "grad_norm": 18.550851560974007, + "learning_rate": 2.414182426310597e-07, + "logits/chosen": -0.28685250878334045, + "logits/rejected": -0.19694644212722778, + "logps/chosen": -3.164374828338623, + "logps/rejected": -4.608752250671387, + "loss": 0.4438, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -3.164374828338623, + "rewards/margins": 1.444377064704895, + "rewards/rejected": -4.608752250671387, + "sft_loss": 3.336451292037964, + "step": 3955 + }, + { + "epoch": 2.1194179628700454, + "grad_norm": 14.200310042989717, + "learning_rate": 2.400865390717734e-07, + "logits/chosen": -0.22424855828285217, + "logits/rejected": -0.09154853969812393, + "logps/chosen": -3.138336658477783, + "logps/rejected": -4.685622215270996, + "loss": 0.4002, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.138336658477783, + "rewards/margins": 1.5472863912582397, + "rewards/rejected": -4.685622215270996, + "sft_loss": 3.3620047569274902, + "step": 3960 + }, + { + "epoch": 2.1220939956514466, + "grad_norm": 20.604088367208963, + "learning_rate": 2.3875735719402475e-07, + "logits/chosen": -0.24657312035560608, + "logits/rejected": -0.06659922748804092, + "logps/chosen": -3.314816951751709, + "logps/rejected": -4.729458808898926, + "loss": 0.4234, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.314816951751709, + "rewards/margins": 1.414642333984375, + "rewards/rejected": -4.729458808898926, + "sft_loss": 3.615452289581299, + "step": 3965 + }, + { + "epoch": 2.1247700284328483, + "grad_norm": 16.19776372970058, + "learning_rate": 2.3743070989354258e-07, + "logits/chosen": -0.2126585692167282, + "logits/rejected": -0.06898088753223419, + "logps/chosen": -3.199605703353882, + "logps/rejected": -4.575144290924072, + "loss": 0.4858, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -3.199605703353882, + "rewards/margins": 1.3755385875701904, + "rewards/rejected": -4.575144290924072, + "sft_loss": 3.5406627655029297, + "step": 3970 + }, + { + "epoch": 2.12744606121425, + "grad_norm": 15.578792534141156, + "learning_rate": 2.3610661004146454e-07, + "logits/chosen": -0.19905905425548553, + "logits/rejected": -0.027686545625329018, + "logps/chosen": -3.0088601112365723, + "logps/rejected": -4.292378902435303, + "loss": 0.42, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.0088601112365723, + "rewards/margins": 1.2835190296173096, + "rewards/rejected": -4.292378902435303, + "sft_loss": 3.179039478302002, + "step": 3975 + }, + { + "epoch": 2.1301220939956513, + "grad_norm": 17.082743353736696, + "learning_rate": 2.3478507048421314e-07, + "logits/chosen": -0.26677241921424866, + "logits/rejected": -0.1466524749994278, + "logps/chosen": -2.986079692840576, + "logps/rejected": -4.280380725860596, + "loss": 0.4531, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.986079692840576, + "rewards/margins": 1.2943007946014404, + "rewards/rejected": -4.280380725860596, + "sft_loss": 3.35664439201355, + "step": 3980 + }, + { + "epoch": 2.132798126777053, + "grad_norm": 24.458290951910765, + "learning_rate": 2.334661040433713e-07, + "logits/chosen": -0.34359875321388245, + "logits/rejected": -0.19697892665863037, + "logps/chosen": -3.104626178741455, + "logps/rejected": -4.322833061218262, + "loss": 0.4684, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -3.104626178741455, + "rewards/margins": 1.218206763267517, + "rewards/rejected": -4.322833061218262, + "sft_loss": 3.375357151031494, + "step": 3985 + }, + { + "epoch": 2.1354741595584548, + "grad_norm": 15.926635139586462, + "learning_rate": 2.321497235155568e-07, + "logits/chosen": -0.3426477909088135, + "logits/rejected": -0.17477676272392273, + "logps/chosen": -3.032522678375244, + "logps/rejected": -4.347482204437256, + "loss": 0.4145, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -3.032522678375244, + "rewards/margins": 1.3149597644805908, + "rewards/rejected": -4.347482204437256, + "sft_loss": 3.267840623855591, + "step": 3990 + }, + { + "epoch": 2.138150192339856, + "grad_norm": 21.33616482570865, + "learning_rate": 2.3083594167229965e-07, + "logits/chosen": -0.362566739320755, + "logits/rejected": -0.062093090265989304, + "logps/chosen": -3.2226223945617676, + "logps/rejected": -4.450702667236328, + "loss": 0.47, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -3.2226223945617676, + "rewards/margins": 1.2280802726745605, + "rewards/rejected": -4.450702667236328, + "sft_loss": 3.400771379470825, + "step": 3995 + }, + { + "epoch": 2.1408262251212578, + "grad_norm": 24.458228224023852, + "learning_rate": 2.295247712599167e-07, + "logits/chosen": -0.2583773732185364, + "logits/rejected": -0.14837753772735596, + "logps/chosen": -3.1129956245422363, + "logps/rejected": -4.425230026245117, + "loss": 0.4702, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.1129956245422363, + "rewards/margins": 1.3122342824935913, + "rewards/rejected": -4.425230026245117, + "sft_loss": 3.2709991931915283, + "step": 4000 + }, + { + "epoch": 2.1408262251212578, + "eval_logits/chosen": 0.11977552622556686, + "eval_logits/rejected": 0.23950588703155518, + "eval_logps/chosen": -3.3071038722991943, + "eval_logps/rejected": -4.328495025634766, + "eval_loss": 0.5591782927513123, + "eval_rewards/accuracies": 0.7240356206893921, + "eval_rewards/chosen": -3.3071038722991943, + "eval_rewards/margins": 1.0213911533355713, + "eval_rewards/rejected": -4.328495025634766, + "eval_runtime": 51.006, + "eval_samples_per_second": 26.369, + "eval_sft_loss": 3.511914014816284, + "eval_steps_per_second": 6.607, + "step": 4000 + }, + { + "epoch": 2.1435022579026595, + "grad_norm": 15.306638141971865, + "learning_rate": 2.2821622499938948e-07, + "logits/chosen": -0.23230516910552979, + "logits/rejected": 0.016825273633003235, + "logps/chosen": -3.3846182823181152, + "logps/rejected": -4.501579761505127, + "loss": 0.4927, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.3846182823181152, + "rewards/margins": 1.1169617176055908, + "rewards/rejected": -4.501579761505127, + "sft_loss": 3.4896881580352783, + "step": 4005 + }, + { + "epoch": 2.1461782906840607, + "grad_norm": 17.626588110370694, + "learning_rate": 2.269103155862391e-07, + "logits/chosen": -0.3005710244178772, + "logits/rejected": -0.16064947843551636, + "logps/chosen": -3.107633113861084, + "logps/rejected": -4.182787895202637, + "loss": 0.4992, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.107633113861084, + "rewards/margins": 1.0751547813415527, + "rewards/rejected": -4.182787895202637, + "sft_loss": 3.2684624195098877, + "step": 4010 + }, + { + "epoch": 2.1488543234654625, + "grad_norm": 14.643533459411545, + "learning_rate": 2.2560705569040483e-07, + "logits/chosen": -0.2704944908618927, + "logits/rejected": 0.012443220242857933, + "logps/chosen": -3.1479153633117676, + "logps/rejected": -4.254956245422363, + "loss": 0.5012, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -3.1479153633117676, + "rewards/margins": 1.107041597366333, + "rewards/rejected": -4.254956245422363, + "sft_loss": 3.331916093826294, + "step": 4015 + }, + { + "epoch": 2.151530356246864, + "grad_norm": 13.935643009683375, + "learning_rate": 2.2430645795611963e-07, + "logits/chosen": -0.35706567764282227, + "logits/rejected": -0.17552462220191956, + "logps/chosen": -3.247520923614502, + "logps/rejected": -4.49759578704834, + "loss": 0.448, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.247520923614502, + "rewards/margins": 1.250075340270996, + "rewards/rejected": -4.49759578704834, + "sft_loss": 3.426701307296753, + "step": 4020 + }, + { + "epoch": 2.1542063890282654, + "grad_norm": 21.06771273172547, + "learning_rate": 2.230085350017884e-07, + "logits/chosen": -0.26404106616973877, + "logits/rejected": -0.12290849536657333, + "logps/chosen": -3.0938751697540283, + "logps/rejected": -4.20880651473999, + "loss": 0.4914, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -3.0938751697540283, + "rewards/margins": 1.1149311065673828, + "rewards/rejected": -4.20880651473999, + "sft_loss": 3.2841713428497314, + "step": 4025 + }, + { + "epoch": 2.156882421809667, + "grad_norm": 13.90057948341053, + "learning_rate": 2.2171329941986554e-07, + "logits/chosen": -0.32073497772216797, + "logits/rejected": -0.19450710713863373, + "logps/chosen": -3.0455260276794434, + "logps/rejected": -4.355099201202393, + "loss": 0.3986, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.0455260276794434, + "rewards/margins": 1.3095730543136597, + "rewards/rejected": -4.355099201202393, + "sft_loss": 3.1791176795959473, + "step": 4030 + }, + { + "epoch": 2.159558454591069, + "grad_norm": 17.86472232480832, + "learning_rate": 2.2042076377673202e-07, + "logits/chosen": -0.26653456687927246, + "logits/rejected": -0.24267368018627167, + "logps/chosen": -3.016118288040161, + "logps/rejected": -4.04874324798584, + "loss": 0.4991, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -3.016118288040161, + "rewards/margins": 1.0326251983642578, + "rewards/rejected": -4.04874324798584, + "sft_loss": 3.2150261402130127, + "step": 4035 + }, + { + "epoch": 2.16223448737247, + "grad_norm": 17.784423087700567, + "learning_rate": 2.1913094061257476e-07, + "logits/chosen": -0.2587827444076538, + "logits/rejected": -0.23740462958812714, + "logps/chosen": -3.0443973541259766, + "logps/rejected": -4.254450798034668, + "loss": 0.433, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -3.0443973541259766, + "rewards/margins": 1.2100534439086914, + "rewards/rejected": -4.254450798034668, + "sft_loss": 3.1775622367858887, + "step": 4040 + }, + { + "epoch": 2.164910520153872, + "grad_norm": 17.995001216291413, + "learning_rate": 2.178438424412633e-07, + "logits/chosen": -0.25195929408073425, + "logits/rejected": -0.09768708795309067, + "logps/chosen": -3.103498935699463, + "logps/rejected": -4.227923393249512, + "loss": 0.4917, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.103498935699463, + "rewards/margins": 1.1244243383407593, + "rewards/rejected": -4.227923393249512, + "sft_loss": 3.284104108810425, + "step": 4045 + }, + { + "epoch": 2.1675865529352736, + "grad_norm": 22.167275790320975, + "learning_rate": 2.165594817502302e-07, + "logits/chosen": -0.32782620191574097, + "logits/rejected": -0.1630801260471344, + "logps/chosen": -3.380352020263672, + "logps/rejected": -4.39149808883667, + "loss": 0.521, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -3.380352020263672, + "rewards/margins": 1.0111463069915771, + "rewards/rejected": -4.39149808883667, + "sft_loss": 3.6027674674987793, + "step": 4050 + }, + { + "epoch": 2.170262585716675, + "grad_norm": 18.15649918008833, + "learning_rate": 2.1527787100034806e-07, + "logits/chosen": -0.2120496779680252, + "logits/rejected": -0.1167217493057251, + "logps/chosen": -3.2311127185821533, + "logps/rejected": -4.212703227996826, + "loss": 0.4863, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.2311127185821533, + "rewards/margins": 0.9815902709960938, + "rewards/rejected": -4.212703227996826, + "sft_loss": 3.3510711193084717, + "step": 4055 + }, + { + "epoch": 2.1729386184980766, + "grad_norm": 17.64261135465353, + "learning_rate": 2.1399902262581037e-07, + "logits/chosen": -0.1881551444530487, + "logits/rejected": 0.0027097121346741915, + "logps/chosen": -3.259291887283325, + "logps/rejected": -4.351016044616699, + "loss": 0.5074, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.259291887283325, + "rewards/margins": 1.0917246341705322, + "rewards/rejected": -4.351016044616699, + "sft_loss": 3.5189929008483887, + "step": 4060 + }, + { + "epoch": 2.1756146512794783, + "grad_norm": 20.66924206249742, + "learning_rate": 2.127229490340094e-07, + "logits/chosen": -0.30917078256607056, + "logits/rejected": -0.19973380863666534, + "logps/chosen": -3.1926956176757812, + "logps/rejected": -4.505526065826416, + "loss": 0.4338, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -3.1926956176757812, + "rewards/margins": 1.3128302097320557, + "rewards/rejected": -4.505526065826416, + "sft_loss": 3.4061808586120605, + "step": 4065 + }, + { + "epoch": 2.1782906840608796, + "grad_norm": 24.810042393157584, + "learning_rate": 2.1144966260541698e-07, + "logits/chosen": -0.2025396078824997, + "logits/rejected": 0.03697749227285385, + "logps/chosen": -3.2227625846862793, + "logps/rejected": -4.55974817276001, + "loss": 0.4911, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.2227625846862793, + "rewards/margins": 1.33698570728302, + "rewards/rejected": -4.55974817276001, + "sft_loss": 3.47778582572937, + "step": 4070 + }, + { + "epoch": 2.1809667168422813, + "grad_norm": 17.448281363095408, + "learning_rate": 2.1017917569346332e-07, + "logits/chosen": -0.2623835802078247, + "logits/rejected": -0.03134022280573845, + "logps/chosen": -3.1874072551727295, + "logps/rejected": -4.3760247230529785, + "loss": 0.4427, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -3.1874072551727295, + "rewards/margins": 1.188617467880249, + "rewards/rejected": -4.3760247230529785, + "sft_loss": 3.296792984008789, + "step": 4075 + }, + { + "epoch": 2.183642749623683, + "grad_norm": 16.51718107754312, + "learning_rate": 2.0891150062441837e-07, + "logits/chosen": -0.30696502327919006, + "logits/rejected": -0.14113807678222656, + "logps/chosen": -3.2767624855041504, + "logps/rejected": -4.557929039001465, + "loss": 0.4563, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.2767624855041504, + "rewards/margins": 1.2811663150787354, + "rewards/rejected": -4.557929039001465, + "sft_loss": 3.394482135772705, + "step": 4080 + }, + { + "epoch": 2.1863187824050843, + "grad_norm": 14.918701464472884, + "learning_rate": 2.0764664969727086e-07, + "logits/chosen": -0.22599594295024872, + "logits/rejected": -0.140573650598526, + "logps/chosen": -3.057241678237915, + "logps/rejected": -4.294919013977051, + "loss": 0.4291, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.057241678237915, + "rewards/margins": 1.2376779317855835, + "rewards/rejected": -4.294919013977051, + "sft_loss": 3.1836822032928467, + "step": 4085 + }, + { + "epoch": 2.188994815186486, + "grad_norm": 18.39548959285876, + "learning_rate": 2.0638463518361033e-07, + "logits/chosen": -0.3747314214706421, + "logits/rejected": -0.12570782005786896, + "logps/chosen": -3.098694324493408, + "logps/rejected": -4.325566291809082, + "loss": 0.4412, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -3.098694324493408, + "rewards/margins": 1.2268718481063843, + "rewards/rejected": -4.325566291809082, + "sft_loss": 3.2901692390441895, + "step": 4090 + }, + { + "epoch": 2.1916708479678877, + "grad_norm": 22.101033579028414, + "learning_rate": 2.0512546932750702e-07, + "logits/chosen": -0.29583343863487244, + "logits/rejected": -0.16991981863975525, + "logps/chosen": -3.3528189659118652, + "logps/rejected": -4.421876907348633, + "loss": 0.487, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -3.3528189659118652, + "rewards/margins": 1.0690577030181885, + "rewards/rejected": -4.421876907348633, + "sft_loss": 3.499919891357422, + "step": 4095 + }, + { + "epoch": 2.194346880749289, + "grad_norm": 19.77436652933417, + "learning_rate": 2.0386916434539343e-07, + "logits/chosen": -0.230976864695549, + "logits/rejected": -0.025323525071144104, + "logps/chosen": -3.058785915374756, + "logps/rejected": -4.476016044616699, + "loss": 0.3986, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.058785915374756, + "rewards/margins": 1.4172298908233643, + "rewards/rejected": -4.476016044616699, + "sft_loss": 3.337048053741455, + "step": 4100 + }, + { + "epoch": 2.1970229135306907, + "grad_norm": 17.634780116643828, + "learning_rate": 2.0261573242594627e-07, + "logits/chosen": -0.2085772007703781, + "logits/rejected": 0.01965431496500969, + "logps/chosen": -3.4317848682403564, + "logps/rejected": -4.651656150817871, + "loss": 0.4657, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.4317848682403564, + "rewards/margins": 1.219870924949646, + "rewards/rejected": -4.651656150817871, + "sft_loss": 3.484034776687622, + "step": 4105 + }, + { + "epoch": 2.1996989463120924, + "grad_norm": 27.154747986305658, + "learning_rate": 2.0136518572996724e-07, + "logits/chosen": -0.23308996856212616, + "logits/rejected": 0.0261714868247509, + "logps/chosen": -3.182469129562378, + "logps/rejected": -4.524494171142578, + "loss": 0.443, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.182469129562378, + "rewards/margins": 1.342024803161621, + "rewards/rejected": -4.524494171142578, + "sft_loss": 3.384852170944214, + "step": 4110 + }, + { + "epoch": 2.202374979093494, + "grad_norm": 18.084337518881036, + "learning_rate": 2.0011753639026617e-07, + "logits/chosen": -0.18893378973007202, + "logits/rejected": -0.055924855172634125, + "logps/chosen": -3.2787792682647705, + "logps/rejected": -4.478785514831543, + "loss": 0.4588, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -3.2787792682647705, + "rewards/margins": 1.2000062465667725, + "rewards/rejected": -4.478785514831543, + "sft_loss": 3.457000732421875, + "step": 4115 + }, + { + "epoch": 2.2050510118748954, + "grad_norm": 20.83277114561886, + "learning_rate": 1.988727965115421e-07, + "logits/chosen": -0.24318411946296692, + "logits/rejected": -0.09616503119468689, + "logps/chosen": -3.1696531772613525, + "logps/rejected": -4.420243740081787, + "loss": 0.4518, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.1696531772613525, + "rewards/margins": 1.250590443611145, + "rewards/rejected": -4.420243740081787, + "sft_loss": 3.418315887451172, + "step": 4120 + }, + { + "epoch": 2.207727044656297, + "grad_norm": 16.275212367228942, + "learning_rate": 1.9763097817026713e-07, + "logits/chosen": -0.29338568449020386, + "logits/rejected": -0.045070819556713104, + "logps/chosen": -3.2043399810791016, + "logps/rejected": -4.738910675048828, + "loss": 0.3962, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.2043399810791016, + "rewards/margins": 1.5345704555511475, + "rewards/rejected": -4.738910675048828, + "sft_loss": 3.4029903411865234, + "step": 4125 + }, + { + "epoch": 2.210403077437699, + "grad_norm": 16.95359835468456, + "learning_rate": 1.9639209341456796e-07, + "logits/chosen": -0.14141836762428284, + "logits/rejected": -0.013434246182441711, + "logps/chosen": -3.340695858001709, + "logps/rejected": -4.655120372772217, + "loss": 0.462, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -3.340695858001709, + "rewards/margins": 1.3144241571426392, + "rewards/rejected": -4.655120372772217, + "sft_loss": 3.5574791431427, + "step": 4130 + }, + { + "epoch": 2.2130791102191, + "grad_norm": 14.211923543904996, + "learning_rate": 1.951561542641102e-07, + "logits/chosen": -0.10375924408435822, + "logits/rejected": -0.10831058025360107, + "logps/chosen": -3.3312363624572754, + "logps/rejected": -4.6244707107543945, + "loss": 0.5066, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.3312363624572754, + "rewards/margins": 1.2932345867156982, + "rewards/rejected": -4.6244707107543945, + "sft_loss": 3.4978721141815186, + "step": 4135 + }, + { + "epoch": 2.215755143000502, + "grad_norm": 18.96497521571202, + "learning_rate": 1.939231727099806e-07, + "logits/chosen": -0.36096328496932983, + "logits/rejected": -0.27226656675338745, + "logps/chosen": -3.230943202972412, + "logps/rejected": -4.421606540679932, + "loss": 0.4769, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.230943202972412, + "rewards/margins": 1.1906629800796509, + "rewards/rejected": -4.421606540679932, + "sft_loss": 3.3588504791259766, + "step": 4140 + }, + { + "epoch": 2.2184311757819035, + "grad_norm": 17.525435531912834, + "learning_rate": 1.926931607145719e-07, + "logits/chosen": -0.09570387005805969, + "logits/rejected": 0.07199952751398087, + "logps/chosen": -3.4613890647888184, + "logps/rejected": -4.609736442565918, + "loss": 0.4874, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.4613890647888184, + "rewards/margins": 1.148347020149231, + "rewards/rejected": -4.609736442565918, + "sft_loss": 3.61700439453125, + "step": 4145 + }, + { + "epoch": 2.221107208563305, + "grad_norm": 15.292425629356083, + "learning_rate": 1.9146613021146564e-07, + "logits/chosen": -0.20875485241413116, + "logits/rejected": -0.06416013836860657, + "logps/chosen": -3.036895990371704, + "logps/rejected": -4.218943119049072, + "loss": 0.4766, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.036895990371704, + "rewards/margins": 1.1820474863052368, + "rewards/rejected": -4.218943119049072, + "sft_loss": 3.233987808227539, + "step": 4150 + }, + { + "epoch": 2.2237832413447065, + "grad_norm": 18.999966934345977, + "learning_rate": 1.9024209310531736e-07, + "logits/chosen": -0.17762920260429382, + "logits/rejected": -0.15139761567115784, + "logps/chosen": -3.201995372772217, + "logps/rejected": -4.4034857749938965, + "loss": 0.463, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -3.201995372772217, + "rewards/margins": 1.2014906406402588, + "rewards/rejected": -4.4034857749938965, + "sft_loss": 3.354133129119873, + "step": 4155 + }, + { + "epoch": 2.2264592741261082, + "grad_norm": 24.036186461034966, + "learning_rate": 1.890210612717401e-07, + "logits/chosen": -0.22414183616638184, + "logits/rejected": -0.04488285258412361, + "logps/chosen": -3.2703185081481934, + "logps/rejected": -4.445802688598633, + "loss": 0.4562, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -3.2703185081481934, + "rewards/margins": 1.1754839420318604, + "rewards/rejected": -4.445802688598633, + "sft_loss": 3.437479019165039, + "step": 4160 + }, + { + "epoch": 2.2291353069075095, + "grad_norm": 20.103343151615263, + "learning_rate": 1.8780304655719054e-07, + "logits/chosen": -0.21300466358661652, + "logits/rejected": -0.032338377088308334, + "logps/chosen": -3.191817283630371, + "logps/rejected": -4.5335612297058105, + "loss": 0.4473, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.191817283630371, + "rewards/margins": 1.3417439460754395, + "rewards/rejected": -4.5335612297058105, + "sft_loss": 3.402716875076294, + "step": 4165 + }, + { + "epoch": 2.231811339688911, + "grad_norm": 27.46925171279301, + "learning_rate": 1.865880607788523e-07, + "logits/chosen": -0.037474703043699265, + "logits/rejected": 0.050803374499082565, + "logps/chosen": -3.1691462993621826, + "logps/rejected": -4.388891696929932, + "loss": 0.4655, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -3.1691462993621826, + "rewards/margins": 1.2197450399398804, + "rewards/rejected": -4.388891696929932, + "sft_loss": 3.487267255783081, + "step": 4170 + }, + { + "epoch": 2.234487372470313, + "grad_norm": 23.89090887577621, + "learning_rate": 1.8537611572452316e-07, + "logits/chosen": -0.2057659924030304, + "logits/rejected": -0.08916045725345612, + "logps/chosen": -3.134235143661499, + "logps/rejected": -4.19904088973999, + "loss": 0.4793, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.134235143661499, + "rewards/margins": 1.0648062229156494, + "rewards/rejected": -4.19904088973999, + "sft_loss": 3.349034547805786, + "step": 4175 + }, + { + "epoch": 2.237163405251714, + "grad_norm": 17.81412265801276, + "learning_rate": 1.84167223152499e-07, + "logits/chosen": -0.2543816566467285, + "logits/rejected": 0.004586332943290472, + "logps/chosen": -3.1645865440368652, + "logps/rejected": -4.45761251449585, + "loss": 0.438, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -3.1645865440368652, + "rewards/margins": 1.2930257320404053, + "rewards/rejected": -4.45761251449585, + "sft_loss": 3.3859825134277344, + "step": 4180 + }, + { + "epoch": 2.239839438033116, + "grad_norm": 21.95605652435121, + "learning_rate": 1.8296139479146112e-07, + "logits/chosen": -0.2591497302055359, + "logits/rejected": -0.19623364508152008, + "logps/chosen": -3.06785249710083, + "logps/rejected": -4.301652908325195, + "loss": 0.4886, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.06785249710083, + "rewards/margins": 1.2338005304336548, + "rewards/rejected": -4.301652908325195, + "sft_loss": 3.2426631450653076, + "step": 4185 + }, + { + "epoch": 2.2425154708145176, + "grad_norm": 17.555340000578923, + "learning_rate": 1.8175864234036132e-07, + "logits/chosen": -0.07663850486278534, + "logits/rejected": 0.023242291063070297, + "logps/chosen": -3.1073408126831055, + "logps/rejected": -4.364688873291016, + "loss": 0.4727, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.1073408126831055, + "rewards/margins": 1.2573484182357788, + "rewards/rejected": -4.364688873291016, + "sft_loss": 3.2235629558563232, + "step": 4190 + }, + { + "epoch": 2.245191503595919, + "grad_norm": 16.461587993114804, + "learning_rate": 1.805589774683094e-07, + "logits/chosen": -0.33367663621902466, + "logits/rejected": -0.15988479554653168, + "logps/chosen": -3.0423474311828613, + "logps/rejected": -4.084300518035889, + "loss": 0.4826, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.0423474311828613, + "rewards/margins": 1.0419528484344482, + "rewards/rejected": -4.084300518035889, + "sft_loss": 3.2633557319641113, + "step": 4195 + }, + { + "epoch": 2.2478675363773206, + "grad_norm": 19.07017109089392, + "learning_rate": 1.79362411814459e-07, + "logits/chosen": -0.0908198133111, + "logits/rejected": -0.10554500669240952, + "logps/chosen": -3.247302532196045, + "logps/rejected": -4.188872814178467, + "loss": 0.5486, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -3.247302532196045, + "rewards/margins": 0.9415702819824219, + "rewards/rejected": -4.188872814178467, + "sft_loss": 3.4122467041015625, + "step": 4200 + }, + { + "epoch": 2.2505435691587223, + "grad_norm": 15.418712620736029, + "learning_rate": 1.7816895698789552e-07, + "logits/chosen": -0.2686876654624939, + "logits/rejected": -0.1589660346508026, + "logps/chosen": -3.0749001502990723, + "logps/rejected": -4.215682506561279, + "loss": 0.4496, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -3.0749001502990723, + "rewards/margins": 1.140782117843628, + "rewards/rejected": -4.215682506561279, + "sft_loss": 3.247279405593872, + "step": 4205 + }, + { + "epoch": 2.2532196019401236, + "grad_norm": 14.329212400039616, + "learning_rate": 1.7697862456752271e-07, + "logits/chosen": -0.2475365698337555, + "logits/rejected": -0.07634967565536499, + "logps/chosen": -3.143423318862915, + "logps/rejected": -4.52433967590332, + "loss": 0.4364, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.143423318862915, + "rewards/margins": 1.3809159994125366, + "rewards/rejected": -4.52433967590332, + "sft_loss": 3.3332431316375732, + "step": 4210 + }, + { + "epoch": 2.2558956347215253, + "grad_norm": 18.48070952568049, + "learning_rate": 1.7579142610195124e-07, + "logits/chosen": -0.21497571468353271, + "logits/rejected": -0.01970536634325981, + "logps/chosen": -3.140994071960449, + "logps/rejected": -4.3509111404418945, + "loss": 0.4771, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.140994071960449, + "rewards/margins": 1.2099171876907349, + "rewards/rejected": -4.3509111404418945, + "sft_loss": 3.2334461212158203, + "step": 4215 + }, + { + "epoch": 2.258571667502927, + "grad_norm": 13.278055635617374, + "learning_rate": 1.7460737310938568e-07, + "logits/chosen": -0.2955940067768097, + "logits/rejected": -0.019859764724969864, + "logps/chosen": -2.979950428009033, + "logps/rejected": -4.360445499420166, + "loss": 0.413, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.979950428009033, + "rewards/margins": 1.3804947137832642, + "rewards/rejected": -4.360445499420166, + "sft_loss": 3.183371067047119, + "step": 4220 + }, + { + "epoch": 2.2612477002843283, + "grad_norm": 14.165936883872977, + "learning_rate": 1.734264770775133e-07, + "logits/chosen": -0.25720852613449097, + "logits/rejected": 0.03133372589945793, + "logps/chosen": -3.1578311920166016, + "logps/rejected": -4.32140588760376, + "loss": 0.4857, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.1578311920166016, + "rewards/margins": 1.1635749340057373, + "rewards/rejected": -4.32140588760376, + "sft_loss": 3.2801055908203125, + "step": 4225 + }, + { + "epoch": 2.26392373306573, + "grad_norm": 16.085409013269818, + "learning_rate": 1.7224874946339241e-07, + "logits/chosen": -0.23832368850708008, + "logits/rejected": -0.12906375527381897, + "logps/chosen": -3.1299870014190674, + "logps/rejected": -4.382236480712891, + "loss": 0.4664, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.1299870014190674, + "rewards/margins": 1.2522494792938232, + "rewards/rejected": -4.382236480712891, + "sft_loss": 3.1819064617156982, + "step": 4230 + }, + { + "epoch": 2.2665997658471317, + "grad_norm": 14.745450418612242, + "learning_rate": 1.7107420169334186e-07, + "logits/chosen": -0.19955693185329437, + "logits/rejected": -0.08378178626298904, + "logps/chosen": -3.154358386993408, + "logps/rejected": -4.298888206481934, + "loss": 0.486, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.154358386993408, + "rewards/margins": 1.1445305347442627, + "rewards/rejected": -4.298888206481934, + "sft_loss": 3.3586158752441406, + "step": 4235 + }, + { + "epoch": 2.269275798628533, + "grad_norm": 16.213106248209986, + "learning_rate": 1.6990284516282893e-07, + "logits/chosen": -0.21659307181835175, + "logits/rejected": -0.07814633846282959, + "logps/chosen": -3.015805721282959, + "logps/rejected": -4.245603561401367, + "loss": 0.438, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -3.015805721282959, + "rewards/margins": 1.2297978401184082, + "rewards/rejected": -4.245603561401367, + "sft_loss": 3.2131104469299316, + "step": 4240 + }, + { + "epoch": 2.2719518314099347, + "grad_norm": 19.323395120992416, + "learning_rate": 1.687346912363602e-07, + "logits/chosen": -0.23957762122154236, + "logits/rejected": -0.04110132157802582, + "logps/chosen": -3.1102561950683594, + "logps/rejected": -4.384282112121582, + "loss": 0.4427, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.1102561950683594, + "rewards/margins": 1.2740260362625122, + "rewards/rejected": -4.384282112121582, + "sft_loss": 3.2916996479034424, + "step": 4245 + }, + { + "epoch": 2.2746278641913364, + "grad_norm": 16.581508891925253, + "learning_rate": 1.675697512473697e-07, + "logits/chosen": -0.255021333694458, + "logits/rejected": -0.0009258926147595048, + "logps/chosen": -3.1822104454040527, + "logps/rejected": -4.498100757598877, + "loss": 0.4329, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.1822104454040527, + "rewards/margins": 1.3158907890319824, + "rewards/rejected": -4.498100757598877, + "sft_loss": 3.278721332550049, + "step": 4250 + }, + { + "epoch": 2.2773038969727377, + "grad_norm": 18.7455944053278, + "learning_rate": 1.6640803649811087e-07, + "logits/chosen": -0.23095567524433136, + "logits/rejected": 0.0882527232170105, + "logps/chosen": -3.2245659828186035, + "logps/rejected": -4.542080879211426, + "loss": 0.435, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.2245659828186035, + "rewards/margins": 1.3175151348114014, + "rewards/rejected": -4.542080879211426, + "sft_loss": 3.3141732215881348, + "step": 4255 + }, + { + "epoch": 2.2799799297541394, + "grad_norm": 21.117638663135256, + "learning_rate": 1.6524955825954472e-07, + "logits/chosen": -0.1814691722393036, + "logits/rejected": -0.07471819221973419, + "logps/chosen": -3.1403911113739014, + "logps/rejected": -4.332183837890625, + "loss": 0.4644, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.1403911113739014, + "rewards/margins": 1.1917924880981445, + "rewards/rejected": -4.332183837890625, + "sft_loss": 3.2049949169158936, + "step": 4260 + }, + { + "epoch": 2.282655962535541, + "grad_norm": 15.627427866447883, + "learning_rate": 1.6409432777123277e-07, + "logits/chosen": -0.2731134295463562, + "logits/rejected": -0.046000413596630096, + "logps/chosen": -3.2276968955993652, + "logps/rejected": -4.660434722900391, + "loss": 0.4399, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -3.2276968955993652, + "rewards/margins": 1.432737946510315, + "rewards/rejected": -4.660434722900391, + "sft_loss": 3.427386522293091, + "step": 4265 + }, + { + "epoch": 2.285331995316943, + "grad_norm": 18.08578740103005, + "learning_rate": 1.6294235624122577e-07, + "logits/chosen": -0.138192817568779, + "logits/rejected": 0.15803620219230652, + "logps/chosen": -3.313798189163208, + "logps/rejected": -4.528017997741699, + "loss": 0.4846, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.313798189163208, + "rewards/margins": 1.2142199277877808, + "rewards/rejected": -4.528017997741699, + "sft_loss": 3.413130521774292, + "step": 4270 + }, + { + "epoch": 2.288008028098344, + "grad_norm": 18.778224971256115, + "learning_rate": 1.6179365484595697e-07, + "logits/chosen": -0.20826223492622375, + "logits/rejected": -0.047176480293273926, + "logps/chosen": -3.294180393218994, + "logps/rejected": -4.4106550216674805, + "loss": 0.4984, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.294180393218994, + "rewards/margins": 1.1164753437042236, + "rewards/rejected": -4.4106550216674805, + "sft_loss": 3.4413559436798096, + "step": 4275 + }, + { + "epoch": 2.290684060879746, + "grad_norm": 17.130370672952917, + "learning_rate": 1.60648234730132e-07, + "logits/chosen": -0.22210946679115295, + "logits/rejected": -0.08392616361379623, + "logps/chosen": -3.220755100250244, + "logps/rejected": -4.539450645446777, + "loss": 0.4222, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -3.220755100250244, + "rewards/margins": 1.3186959028244019, + "rewards/rejected": -4.539450645446777, + "sft_loss": 3.375286102294922, + "step": 4280 + }, + { + "epoch": 2.293360093661147, + "grad_norm": 29.69947353654119, + "learning_rate": 1.595061070066222e-07, + "logits/chosen": -0.16046538949012756, + "logits/rejected": -0.1463548094034195, + "logps/chosen": -3.1819510459899902, + "logps/rejected": -4.588136196136475, + "loss": 0.4208, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.1819510459899902, + "rewards/margins": 1.4061849117279053, + "rewards/rejected": -4.588136196136475, + "sft_loss": 3.3452370166778564, + "step": 4285 + }, + { + "epoch": 2.296036126442549, + "grad_norm": 25.87574365274802, + "learning_rate": 1.5836728275635542e-07, + "logits/chosen": -0.3043631911277771, + "logits/rejected": -0.08773352950811386, + "logps/chosen": -3.3570823669433594, + "logps/rejected": -4.547373294830322, + "loss": 0.4848, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -3.3570823669433594, + "rewards/margins": 1.1902905702590942, + "rewards/rejected": -4.547373294830322, + "sft_loss": 3.4400806427001953, + "step": 4290 + }, + { + "epoch": 2.2987121592239506, + "grad_norm": 23.134324933736, + "learning_rate": 1.5723177302820984e-07, + "logits/chosen": -0.2654676139354706, + "logits/rejected": -0.1498531848192215, + "logps/chosen": -3.3259081840515137, + "logps/rejected": -4.402651786804199, + "loss": 0.4778, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.3259081840515137, + "rewards/margins": 1.076743483543396, + "rewards/rejected": -4.402651786804199, + "sft_loss": 3.395968198776245, + "step": 4295 + }, + { + "epoch": 2.3013881920053523, + "grad_norm": 17.202436809824984, + "learning_rate": 1.5609958883890544e-07, + "logits/chosen": -0.18384288251399994, + "logits/rejected": -0.0234028659760952, + "logps/chosen": -3.232166290283203, + "logps/rejected": -4.441971778869629, + "loss": 0.4294, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -3.232166290283203, + "rewards/margins": 1.2098052501678467, + "rewards/rejected": -4.441971778869629, + "sft_loss": 3.3029544353485107, + "step": 4300 + }, + { + "epoch": 2.3040642247867535, + "grad_norm": 15.9017208826614, + "learning_rate": 1.5497074117289865e-07, + "logits/chosen": -0.3002277910709381, + "logits/rejected": -0.1476120501756668, + "logps/chosen": -3.1275136470794678, + "logps/rejected": -4.544375419616699, + "loss": 0.4294, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.1275136470794678, + "rewards/margins": 1.416861653327942, + "rewards/rejected": -4.544375419616699, + "sft_loss": 3.3830864429473877, + "step": 4305 + }, + { + "epoch": 2.3067402575681553, + "grad_norm": 17.702231436246656, + "learning_rate": 1.5384524098227402e-07, + "logits/chosen": -0.24221201241016388, + "logits/rejected": 0.003594371723011136, + "logps/chosen": -3.3001155853271484, + "logps/rejected": -4.792794227600098, + "loss": 0.3949, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.3001155853271484, + "rewards/margins": 1.4926788806915283, + "rewards/rejected": -4.792794227600098, + "sft_loss": 3.482017993927002, + "step": 4310 + }, + { + "epoch": 2.3094162903495565, + "grad_norm": 19.830644859083993, + "learning_rate": 1.5272309918663974e-07, + "logits/chosen": -0.21288709342479706, + "logits/rejected": -0.005537429358810186, + "logps/chosen": -3.3459548950195312, + "logps/rejected": -4.385908126831055, + "loss": 0.5239, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -3.3459548950195312, + "rewards/margins": 1.0399537086486816, + "rewards/rejected": -4.385908126831055, + "sft_loss": 3.5866851806640625, + "step": 4315 + }, + { + "epoch": 2.3120923231309582, + "grad_norm": 17.455952905286868, + "learning_rate": 1.516043266730201e-07, + "logits/chosen": -0.2241850644350052, + "logits/rejected": -0.025332236662507057, + "logps/chosen": -3.29628324508667, + "logps/rejected": -4.559080600738525, + "loss": 0.4467, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.29628324508667, + "rewards/margins": 1.2627968788146973, + "rewards/rejected": -4.559080600738525, + "sft_loss": 3.4109034538269043, + "step": 4320 + }, + { + "epoch": 2.31476835591236, + "grad_norm": 29.111777110334728, + "learning_rate": 1.504889342957512e-07, + "logits/chosen": -0.20829851925373077, + "logits/rejected": -0.0016296729445457458, + "logps/chosen": -3.3091533184051514, + "logps/rejected": -4.5012311935424805, + "loss": 0.5246, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -3.3091533184051514, + "rewards/margins": 1.1920771598815918, + "rewards/rejected": -4.5012311935424805, + "sft_loss": 3.4718575477600098, + "step": 4325 + }, + { + "epoch": 2.3174443886937617, + "grad_norm": 19.87629918740408, + "learning_rate": 1.4937693287637453e-07, + "logits/chosen": -0.22584417462348938, + "logits/rejected": -0.039196573197841644, + "logps/chosen": -3.3293514251708984, + "logps/rejected": -4.511477470397949, + "loss": 0.4871, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.3293514251708984, + "rewards/margins": 1.1821262836456299, + "rewards/rejected": -4.511477470397949, + "sft_loss": 3.4103927612304688, + "step": 4330 + }, + { + "epoch": 2.320120421475163, + "grad_norm": 16.06874907463283, + "learning_rate": 1.4826833320353305e-07, + "logits/chosen": -0.22309021651744843, + "logits/rejected": -0.1022438034415245, + "logps/chosen": -3.1893844604492188, + "logps/rejected": -4.463017463684082, + "loss": 0.4437, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -3.1893844604492188, + "rewards/margins": 1.2736327648162842, + "rewards/rejected": -4.463017463684082, + "sft_loss": 3.256054639816284, + "step": 4335 + }, + { + "epoch": 2.3227964542565647, + "grad_norm": 21.328061639796502, + "learning_rate": 1.4716314603286528e-07, + "logits/chosen": -0.2732570767402649, + "logits/rejected": -0.022668302059173584, + "logps/chosen": -3.1785480976104736, + "logps/rejected": -4.607804298400879, + "loss": 0.3982, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.1785480976104736, + "rewards/margins": 1.429255723953247, + "rewards/rejected": -4.607804298400879, + "sft_loss": 3.4070191383361816, + "step": 4340 + }, + { + "epoch": 2.3254724870379664, + "grad_norm": 30.04601648990877, + "learning_rate": 1.4606138208690233e-07, + "logits/chosen": -0.2265649288892746, + "logits/rejected": -0.12993165850639343, + "logps/chosen": -3.307100772857666, + "logps/rejected": -4.453924655914307, + "loss": 0.5078, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -3.307100772857666, + "rewards/margins": 1.1468241214752197, + "rewards/rejected": -4.453924655914307, + "sft_loss": 3.414353847503662, + "step": 4345 + }, + { + "epoch": 2.3281485198193677, + "grad_norm": 16.1389666815883, + "learning_rate": 1.4496305205496251e-07, + "logits/chosen": -0.1859712153673172, + "logits/rejected": -0.08136789500713348, + "logps/chosen": -3.3702845573425293, + "logps/rejected": -4.737895965576172, + "loss": 0.4478, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -3.3702845573425293, + "rewards/margins": 1.3676111698150635, + "rewards/rejected": -4.737895965576172, + "sft_loss": 3.4948036670684814, + "step": 4350 + }, + { + "epoch": 2.3308245526007694, + "grad_norm": 15.328700754654726, + "learning_rate": 1.4386816659304895e-07, + "logits/chosen": -0.31797105073928833, + "logits/rejected": -0.13324691355228424, + "logps/chosen": -3.2428574562072754, + "logps/rejected": -4.482692718505859, + "loss": 0.4206, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -3.2428574562072754, + "rewards/margins": 1.2398353815078735, + "rewards/rejected": -4.482692718505859, + "sft_loss": 3.4420928955078125, + "step": 4355 + }, + { + "epoch": 2.333500585382171, + "grad_norm": 19.534348069559012, + "learning_rate": 1.4277673632374492e-07, + "logits/chosen": -0.28887489438056946, + "logits/rejected": 0.003976461477577686, + "logps/chosen": -3.2906270027160645, + "logps/rejected": -4.527071952819824, + "loss": 0.451, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.2906270027160645, + "rewards/margins": 1.2364448308944702, + "rewards/rejected": -4.527071952819824, + "sft_loss": 3.4399611949920654, + "step": 4360 + }, + { + "epoch": 2.3361766181635724, + "grad_norm": 15.875875054436587, + "learning_rate": 1.416887718361119e-07, + "logits/chosen": -0.12166640907526016, + "logits/rejected": -0.08416490256786346, + "logps/chosen": -3.2442119121551514, + "logps/rejected": -4.411725044250488, + "loss": 0.4747, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -3.2442119121551514, + "rewards/margins": 1.1675128936767578, + "rewards/rejected": -4.411725044250488, + "sft_loss": 3.3793673515319824, + "step": 4365 + }, + { + "epoch": 2.338852650944974, + "grad_norm": 21.08263580693356, + "learning_rate": 1.406042836855859e-07, + "logits/chosen": -0.1815926879644394, + "logits/rejected": -0.031064201146364212, + "logps/chosen": -3.109001874923706, + "logps/rejected": -4.556884765625, + "loss": 0.3986, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.109001874923706, + "rewards/margins": 1.4478830099105835, + "rewards/rejected": -4.556884765625, + "sft_loss": 3.2362945079803467, + "step": 4370 + }, + { + "epoch": 2.341528683726376, + "grad_norm": 23.273463422383422, + "learning_rate": 1.3952328239387595e-07, + "logits/chosen": -0.30537500977516174, + "logits/rejected": -0.017103061079978943, + "logps/chosen": -3.161263942718506, + "logps/rejected": -4.516915798187256, + "loss": 0.4535, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -3.161263942718506, + "rewards/margins": 1.355651617050171, + "rewards/rejected": -4.516915798187256, + "sft_loss": 3.4046154022216797, + "step": 4375 + }, + { + "epoch": 2.344204716507777, + "grad_norm": 17.58767621350646, + "learning_rate": 1.3844577844886109e-07, + "logits/chosen": -0.30194127559661865, + "logits/rejected": -0.034553758800029755, + "logps/chosen": -3.2942283153533936, + "logps/rejected": -4.528740882873535, + "loss": 0.4589, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.2942283153533936, + "rewards/margins": 1.2345125675201416, + "rewards/rejected": -4.528740882873535, + "sft_loss": 3.4428882598876953, + "step": 4380 + }, + { + "epoch": 2.346880749289179, + "grad_norm": 21.7044093632797, + "learning_rate": 1.3737178230448955e-07, + "logits/chosen": -0.2997553050518036, + "logits/rejected": -0.13085004687309265, + "logps/chosen": -3.28190541267395, + "logps/rejected": -4.441516399383545, + "loss": 0.4993, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -3.28190541267395, + "rewards/margins": 1.1596112251281738, + "rewards/rejected": -4.441516399383545, + "sft_loss": 3.4178061485290527, + "step": 4385 + }, + { + "epoch": 2.3495567820705805, + "grad_norm": 15.029912018578793, + "learning_rate": 1.363013043806764e-07, + "logits/chosen": -0.2474454939365387, + "logits/rejected": -0.08195488899946213, + "logps/chosen": -3.1726186275482178, + "logps/rejected": -4.371608257293701, + "loss": 0.4506, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -3.1726186275482178, + "rewards/margins": 1.1989902257919312, + "rewards/rejected": -4.371608257293701, + "sft_loss": 3.3866195678710938, + "step": 4390 + }, + { + "epoch": 2.3522328148519818, + "grad_norm": 18.499020289783466, + "learning_rate": 1.352343550632034e-07, + "logits/chosen": -0.2626621127128601, + "logits/rejected": -0.05871880054473877, + "logps/chosen": -3.1768505573272705, + "logps/rejected": -4.5565314292907715, + "loss": 0.4419, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.1768505573272705, + "rewards/margins": 1.3796809911727905, + "rewards/rejected": -4.5565314292907715, + "sft_loss": 3.2931389808654785, + "step": 4395 + }, + { + "epoch": 2.3549088476333835, + "grad_norm": 15.70069909859834, + "learning_rate": 1.3417094470361722e-07, + "logits/chosen": -0.256940633058548, + "logits/rejected": -0.08759448677301407, + "logps/chosen": -3.290684938430786, + "logps/rejected": -4.433601379394531, + "loss": 0.4882, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.290684938430786, + "rewards/margins": 1.1429167985916138, + "rewards/rejected": -4.433601379394531, + "sft_loss": 3.5128185749053955, + "step": 4400 + }, + { + "epoch": 2.3549088476333835, + "eval_logits/chosen": 0.1602650135755539, + "eval_logits/rejected": 0.28520262241363525, + "eval_logps/chosen": -3.3795173168182373, + "eval_logps/rejected": -4.435532569885254, + "eval_loss": 0.560058057308197, + "eval_rewards/accuracies": 0.7270029783248901, + "eval_rewards/chosen": -3.3795173168182373, + "eval_rewards/margins": 1.0560152530670166, + "eval_rewards/rejected": -4.435532569885254, + "eval_runtime": 51.3281, + "eval_samples_per_second": 26.204, + "eval_sft_loss": 3.520069122314453, + "eval_steps_per_second": 6.566, + "step": 4400 + }, + { + "epoch": 2.357584880414785, + "grad_norm": 17.075591569473236, + "learning_rate": 1.3311108361913015e-07, + "logits/chosen": -0.2871529161930084, + "logits/rejected": -0.22709603607654572, + "logps/chosen": -3.151932954788208, + "logps/rejected": -4.415287971496582, + "loss": 0.4184, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.151932954788208, + "rewards/margins": 1.2633541822433472, + "rewards/rejected": -4.415287971496582, + "sft_loss": 3.2805488109588623, + "step": 4405 + }, + { + "epoch": 2.3602609131961865, + "grad_norm": 14.639353219888728, + "learning_rate": 1.3205478209251874e-07, + "logits/chosen": -0.20357315242290497, + "logits/rejected": -0.0712977796792984, + "logps/chosen": -3.374850034713745, + "logps/rejected": -4.77631950378418, + "loss": 0.4383, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.374850034713745, + "rewards/margins": 1.401469349861145, + "rewards/rejected": -4.77631950378418, + "sft_loss": 3.5529561042785645, + "step": 4410 + }, + { + "epoch": 2.362936945977588, + "grad_norm": 14.59497078621693, + "learning_rate": 1.310020503720254e-07, + "logits/chosen": -0.21676579117774963, + "logits/rejected": 0.007253182120621204, + "logps/chosen": -3.2841403484344482, + "logps/rejected": -4.623076438903809, + "loss": 0.4617, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.2841403484344482, + "rewards/margins": 1.3389358520507812, + "rewards/rejected": -4.623076438903809, + "sft_loss": 3.3688995838165283, + "step": 4415 + }, + { + "epoch": 2.36561297875899, + "grad_norm": 19.331359521271466, + "learning_rate": 1.2995289867125752e-07, + "logits/chosen": -0.22291286289691925, + "logits/rejected": -0.1075035110116005, + "logps/chosen": -3.3117096424102783, + "logps/rejected": -4.385438442230225, + "loss": 0.4872, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.3117096424102783, + "rewards/margins": 1.0737292766571045, + "rewards/rejected": -4.385438442230225, + "sft_loss": 3.410828113555908, + "step": 4420 + }, + { + "epoch": 2.368289011540391, + "grad_norm": 15.560749043815754, + "learning_rate": 1.2890733716908986e-07, + "logits/chosen": -0.22724834084510803, + "logits/rejected": -0.11599922180175781, + "logps/chosen": -3.066779375076294, + "logps/rejected": -4.344001293182373, + "loss": 0.3823, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -3.066779375076294, + "rewards/margins": 1.2772223949432373, + "rewards/rejected": -4.344001293182373, + "sft_loss": 3.265014171600342, + "step": 4425 + }, + { + "epoch": 2.370965044321793, + "grad_norm": 21.602036138164067, + "learning_rate": 1.2786537600956454e-07, + "logits/chosen": -0.28180190920829773, + "logits/rejected": -0.04159663990139961, + "logps/chosen": -3.2508647441864014, + "logps/rejected": -4.5376296043396, + "loss": 0.4401, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -3.2508647441864014, + "rewards/margins": 1.2867653369903564, + "rewards/rejected": -4.5376296043396, + "sft_loss": 3.342550754547119, + "step": 4430 + }, + { + "epoch": 2.3736410771031946, + "grad_norm": 15.007896979622949, + "learning_rate": 1.268270253017933e-07, + "logits/chosen": -0.26863187551498413, + "logits/rejected": -0.023631075397133827, + "logps/chosen": -3.238671064376831, + "logps/rejected": -4.531284332275391, + "loss": 0.4468, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.238671064376831, + "rewards/margins": 1.2926135063171387, + "rewards/rejected": -4.531284332275391, + "sft_loss": 3.482830047607422, + "step": 4435 + }, + { + "epoch": 2.376317109884596, + "grad_norm": 15.623102106634239, + "learning_rate": 1.257922951198591e-07, + "logits/chosen": -0.3622228503227234, + "logits/rejected": -0.029821058735251427, + "logps/chosen": -3.1601808071136475, + "logps/rejected": -4.420136451721191, + "loss": 0.4584, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.1601808071136475, + "rewards/margins": 1.2599560022354126, + "rewards/rejected": -4.420136451721191, + "sft_loss": 3.3236122131347656, + "step": 4440 + }, + { + "epoch": 2.3789931426659976, + "grad_norm": 25.78092111116869, + "learning_rate": 1.24761195502719e-07, + "logits/chosen": -0.2920494079589844, + "logits/rejected": -0.02239333651959896, + "logps/chosen": -3.278372287750244, + "logps/rejected": -4.271549224853516, + "loss": 0.5356, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -3.278372287750244, + "rewards/margins": 0.9931772947311401, + "rewards/rejected": -4.271549224853516, + "sft_loss": 3.4388442039489746, + "step": 4445 + }, + { + "epoch": 2.3816691754473993, + "grad_norm": 21.12682235871762, + "learning_rate": 1.2373373645410573e-07, + "logits/chosen": -0.2140198051929474, + "logits/rejected": -0.035846177488565445, + "logps/chosen": -3.3080954551696777, + "logps/rejected": -4.657708644866943, + "loss": 0.4588, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -3.3080954551696777, + "rewards/margins": 1.3496134281158447, + "rewards/rejected": -4.657708644866943, + "sft_loss": 3.464963436126709, + "step": 4450 + }, + { + "epoch": 2.384345208228801, + "grad_norm": 19.12954636525194, + "learning_rate": 1.2270992794243175e-07, + "logits/chosen": -0.30315282940864563, + "logits/rejected": -0.14045214653015137, + "logps/chosen": -3.2173290252685547, + "logps/rejected": -4.5085835456848145, + "loss": 0.452, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -3.2173290252685547, + "rewards/margins": 1.2912541627883911, + "rewards/rejected": -4.5085835456848145, + "sft_loss": 3.383535861968994, + "step": 4455 + }, + { + "epoch": 2.3870212410102023, + "grad_norm": 12.617700363992414, + "learning_rate": 1.2168977990069147e-07, + "logits/chosen": -0.29781395196914673, + "logits/rejected": -0.04217588156461716, + "logps/chosen": -3.1576359272003174, + "logps/rejected": -4.424050331115723, + "loss": 0.4447, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -3.1576359272003174, + "rewards/margins": 1.2664148807525635, + "rewards/rejected": -4.424050331115723, + "sft_loss": 3.398397922515869, + "step": 4460 + }, + { + "epoch": 2.389697273791604, + "grad_norm": 21.343624441388958, + "learning_rate": 1.206733022263659e-07, + "logits/chosen": -0.2910090386867523, + "logits/rejected": -0.040756385773420334, + "logps/chosen": -3.4104666709899902, + "logps/rejected": -4.600484371185303, + "loss": 0.5062, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -3.4104666709899902, + "rewards/margins": 1.1900174617767334, + "rewards/rejected": -4.600484371185303, + "sft_loss": 3.4890716075897217, + "step": 4465 + }, + { + "epoch": 2.3923733065730053, + "grad_norm": 15.448295342736577, + "learning_rate": 1.1966050478132572e-07, + "logits/chosen": -0.20816560089588165, + "logits/rejected": -0.08821313083171844, + "logps/chosen": -3.159154176712036, + "logps/rejected": -4.3316545486450195, + "loss": 0.4997, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -3.159154176712036, + "rewards/margins": 1.1724998950958252, + "rewards/rejected": -4.3316545486450195, + "sft_loss": 3.3820927143096924, + "step": 4470 + }, + { + "epoch": 2.395049339354407, + "grad_norm": 20.533095086063796, + "learning_rate": 1.1865139739173635e-07, + "logits/chosen": -0.2584991157054901, + "logits/rejected": -0.008012844249606133, + "logps/chosen": -3.2850277423858643, + "logps/rejected": -4.464526176452637, + "loss": 0.4552, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.2850277423858643, + "rewards/margins": 1.1794984340667725, + "rewards/rejected": -4.464526176452637, + "sft_loss": 3.3650975227355957, + "step": 4475 + }, + { + "epoch": 2.3977253721358087, + "grad_norm": 18.04195391600987, + "learning_rate": 1.1764598984796187e-07, + "logits/chosen": -0.30945929884910583, + "logits/rejected": -0.12186364829540253, + "logps/chosen": -3.1606342792510986, + "logps/rejected": -4.291658401489258, + "loss": 0.4505, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.1606342792510986, + "rewards/margins": 1.1310243606567383, + "rewards/rejected": -4.291658401489258, + "sft_loss": 3.3320159912109375, + "step": 4480 + }, + { + "epoch": 2.4004014049172104, + "grad_norm": 21.2521243699826, + "learning_rate": 1.1664429190447095e-07, + "logits/chosen": -0.16520674526691437, + "logits/rejected": -0.06304512917995453, + "logps/chosen": -3.2640156745910645, + "logps/rejected": -4.534733772277832, + "loss": 0.4605, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.2640156745910645, + "rewards/margins": 1.2707182168960571, + "rewards/rejected": -4.534733772277832, + "sft_loss": 3.390946865081787, + "step": 4485 + }, + { + "epoch": 2.4030774376986117, + "grad_norm": 23.375242113989113, + "learning_rate": 1.1564631327974122e-07, + "logits/chosen": -0.26219305396080017, + "logits/rejected": -0.012254145927727222, + "logps/chosen": -3.255249500274658, + "logps/rejected": -4.6344709396362305, + "loss": 0.4409, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.255249500274658, + "rewards/margins": 1.3792214393615723, + "rewards/rejected": -4.6344709396362305, + "sft_loss": 3.460409641265869, + "step": 4490 + }, + { + "epoch": 2.4057534704800134, + "grad_norm": 17.363393487201144, + "learning_rate": 1.1465206365616587e-07, + "logits/chosen": -0.3416324853897095, + "logits/rejected": -0.07559507340192795, + "logps/chosen": -3.3181662559509277, + "logps/rejected": -4.4394731521606445, + "loss": 0.4865, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -3.3181662559509277, + "rewards/margins": 1.1213066577911377, + "rewards/rejected": -4.4394731521606445, + "sft_loss": 3.470642566680908, + "step": 4495 + }, + { + "epoch": 2.408429503261415, + "grad_norm": 17.222664032980234, + "learning_rate": 1.1366155267995887e-07, + "logits/chosen": -0.19300106167793274, + "logits/rejected": -0.13747408986091614, + "logps/chosen": -3.1528539657592773, + "logps/rejected": -4.371302604675293, + "loss": 0.4426, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.1528539657592773, + "rewards/margins": 1.218449354171753, + "rewards/rejected": -4.371302604675293, + "sft_loss": 3.3510098457336426, + "step": 4500 + }, + { + "epoch": 2.4111055360428164, + "grad_norm": 18.88858469752771, + "learning_rate": 1.1267478996106228e-07, + "logits/chosen": -0.24253828823566437, + "logits/rejected": 0.018031585961580276, + "logps/chosen": -3.231499195098877, + "logps/rejected": -4.309110164642334, + "loss": 0.4907, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -3.231499195098877, + "rewards/margins": 1.077610731124878, + "rewards/rejected": -4.309110164642334, + "sft_loss": 3.3755271434783936, + "step": 4505 + }, + { + "epoch": 2.413781568824218, + "grad_norm": 16.997664747937158, + "learning_rate": 1.116917850730521e-07, + "logits/chosen": -0.26118624210357666, + "logits/rejected": -0.07530129700899124, + "logps/chosen": -3.262040615081787, + "logps/rejected": -4.351678371429443, + "loss": 0.5123, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -3.262040615081787, + "rewards/margins": 1.0896382331848145, + "rewards/rejected": -4.351678371429443, + "sft_loss": 3.352389097213745, + "step": 4510 + }, + { + "epoch": 2.41645760160562, + "grad_norm": 16.598529612655412, + "learning_rate": 1.1071254755304637e-07, + "logits/chosen": -0.23019631206989288, + "logits/rejected": -0.12985627353191376, + "logps/chosen": -3.1173079013824463, + "logps/rejected": -4.230454921722412, + "loss": 0.5031, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -3.1173079013824463, + "rewards/margins": 1.1131466627120972, + "rewards/rejected": -4.230454921722412, + "sft_loss": 3.243206024169922, + "step": 4515 + }, + { + "epoch": 2.419133634387021, + "grad_norm": 17.53402921610423, + "learning_rate": 1.0973708690161143e-07, + "logits/chosen": -0.275122731924057, + "logits/rejected": -0.12595012784004211, + "logps/chosen": -3.2308623790740967, + "logps/rejected": -4.467679023742676, + "loss": 0.4458, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -3.2308623790740967, + "rewards/margins": 1.2368162870407104, + "rewards/rejected": -4.467679023742676, + "sft_loss": 3.35723614692688, + "step": 4520 + }, + { + "epoch": 2.421809667168423, + "grad_norm": 24.399068677605378, + "learning_rate": 1.0876541258267119e-07, + "logits/chosen": -0.31348997354507446, + "logits/rejected": -0.07134351134300232, + "logps/chosen": -3.2641544342041016, + "logps/rejected": -4.654932975769043, + "loss": 0.4228, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.2641544342041016, + "rewards/margins": 1.39077889919281, + "rewards/rejected": -4.654932975769043, + "sft_loss": 3.3911826610565186, + "step": 4525 + }, + { + "epoch": 2.4244856999498245, + "grad_norm": 19.399546547220996, + "learning_rate": 1.0779753402341379e-07, + "logits/chosen": -0.2883759140968323, + "logits/rejected": -0.15745703876018524, + "logps/chosen": -3.215301990509033, + "logps/rejected": -4.2147536277771, + "loss": 0.503, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.215301990509033, + "rewards/margins": 0.9994519948959351, + "rewards/rejected": -4.2147536277771, + "sft_loss": 3.293431520462036, + "step": 4530 + }, + { + "epoch": 2.427161732731226, + "grad_norm": 20.335051010407632, + "learning_rate": 1.0683346061420157e-07, + "logits/chosen": -0.15196876227855682, + "logits/rejected": -0.02178243175148964, + "logps/chosen": -3.069855213165283, + "logps/rejected": -4.323353290557861, + "loss": 0.479, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.069855213165283, + "rewards/margins": 1.2534980773925781, + "rewards/rejected": -4.323353290557861, + "sft_loss": 3.3428139686584473, + "step": 4535 + }, + { + "epoch": 2.4298377655126275, + "grad_norm": 17.277048138271496, + "learning_rate": 1.0587320170847874e-07, + "logits/chosen": -0.2203926295042038, + "logits/rejected": -0.07835905998945236, + "logps/chosen": -3.045024871826172, + "logps/rejected": -4.137242317199707, + "loss": 0.4971, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -3.045024871826172, + "rewards/margins": 1.0922179222106934, + "rewards/rejected": -4.137242317199707, + "sft_loss": 3.2310516834259033, + "step": 4540 + }, + { + "epoch": 2.4325137982940293, + "grad_norm": 15.126039297100926, + "learning_rate": 1.0491676662268156e-07, + "logits/chosen": -0.16868285834789276, + "logits/rejected": -0.036202650517225266, + "logps/chosen": -3.1142802238464355, + "logps/rejected": -4.321670055389404, + "loss": 0.4742, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.1142802238464355, + "rewards/margins": 1.2073900699615479, + "rewards/rejected": -4.321670055389404, + "sft_loss": 3.253473997116089, + "step": 4545 + }, + { + "epoch": 2.4351898310754305, + "grad_norm": 25.445883199357247, + "learning_rate": 1.0396416463614732e-07, + "logits/chosen": -0.2712944746017456, + "logits/rejected": -0.12396593391895294, + "logps/chosen": -3.0776920318603516, + "logps/rejected": -4.236860275268555, + "loss": 0.4902, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.0776920318603516, + "rewards/margins": 1.1591686010360718, + "rewards/rejected": -4.236860275268555, + "sft_loss": 3.244083881378174, + "step": 4550 + }, + { + "epoch": 2.4378658638568322, + "grad_norm": 16.405844130937062, + "learning_rate": 1.0301540499102479e-07, + "logits/chosen": -0.20970351994037628, + "logits/rejected": -0.0833965465426445, + "logps/chosen": -3.313142776489258, + "logps/rejected": -4.349070072174072, + "loss": 0.5185, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -3.313142776489258, + "rewards/margins": 1.0359264612197876, + "rewards/rejected": -4.349070072174072, + "sft_loss": 3.5295753479003906, + "step": 4555 + }, + { + "epoch": 2.440541896638234, + "grad_norm": 20.700026707905256, + "learning_rate": 1.0207049689218405e-07, + "logits/chosen": -0.26775461435317993, + "logits/rejected": -0.011146956123411655, + "logps/chosen": -3.2772324085235596, + "logps/rejected": -4.6195173263549805, + "loss": 0.4729, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -3.2772324085235596, + "rewards/margins": 1.342284917831421, + "rewards/rejected": -4.6195173263549805, + "sft_loss": 3.369673252105713, + "step": 4560 + }, + { + "epoch": 2.4432179294196352, + "grad_norm": 15.269201220575487, + "learning_rate": 1.0112944950712782e-07, + "logits/chosen": -0.2290545403957367, + "logits/rejected": -0.06699430197477341, + "logps/chosen": -3.1508140563964844, + "logps/rejected": -4.458221435546875, + "loss": 0.4326, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -3.1508140563964844, + "rewards/margins": 1.307407259941101, + "rewards/rejected": -4.458221435546875, + "sft_loss": 3.215066909790039, + "step": 4565 + }, + { + "epoch": 2.445893962201037, + "grad_norm": 19.307766819089892, + "learning_rate": 1.0019227196590174e-07, + "logits/chosen": -0.1831798106431961, + "logits/rejected": 0.01658095046877861, + "logps/chosen": -3.2275054454803467, + "logps/rejected": -4.416240692138672, + "loss": 0.4878, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.2275054454803467, + "rewards/margins": 1.1887353658676147, + "rewards/rejected": -4.416240692138672, + "sft_loss": 3.3482584953308105, + "step": 4570 + }, + { + "epoch": 2.4485699949824387, + "grad_norm": 17.31052507702184, + "learning_rate": 9.925897336100664e-08, + "logits/chosen": -0.13451997935771942, + "logits/rejected": -0.033201027661561966, + "logps/chosen": -3.103423595428467, + "logps/rejected": -4.447488307952881, + "loss": 0.4124, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.103423595428467, + "rewards/margins": 1.3440649509429932, + "rewards/rejected": -4.447488307952881, + "sft_loss": 3.27244234085083, + "step": 4575 + }, + { + "epoch": 2.45124602776384, + "grad_norm": 21.874282682259533, + "learning_rate": 9.832956274730946e-08, + "logits/chosen": -0.19495446979999542, + "logits/rejected": -0.11899854987859726, + "logps/chosen": -3.0447566509246826, + "logps/rejected": -4.065402984619141, + "loss": 0.5101, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.0447566509246826, + "rewards/margins": 1.020646333694458, + "rewards/rejected": -4.065402984619141, + "sft_loss": 3.20434308052063, + "step": 4580 + }, + { + "epoch": 2.4539220605452416, + "grad_norm": 18.616466987766536, + "learning_rate": 9.740404914195633e-08, + "logits/chosen": -0.22558899223804474, + "logits/rejected": -0.0379658117890358, + "logps/chosen": -3.157686710357666, + "logps/rejected": -4.34769868850708, + "loss": 0.4652, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -3.157686710357666, + "rewards/margins": 1.190011739730835, + "rewards/rejected": -4.34769868850708, + "sft_loss": 3.3493080139160156, + "step": 4585 + }, + { + "epoch": 2.4565980933266434, + "grad_norm": 12.986218562492288, + "learning_rate": 9.648244152428392e-08, + "logits/chosen": -0.28454527258872986, + "logits/rejected": -0.11226551234722137, + "logps/chosen": -3.0807158946990967, + "logps/rejected": -4.219048500061035, + "loss": 0.4639, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.0807158946990967, + "rewards/margins": 1.1383326053619385, + "rewards/rejected": -4.219048500061035, + "sft_loss": 3.284151077270508, + "step": 4590 + }, + { + "epoch": 2.4592741261080446, + "grad_norm": 18.4713320773109, + "learning_rate": 9.556474883573379e-08, + "logits/chosen": -0.2847925126552582, + "logits/rejected": -0.12773391604423523, + "logps/chosen": -3.047091007232666, + "logps/rejected": -4.422765254974365, + "loss": 0.4642, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.047091007232666, + "rewards/margins": 1.3756738901138306, + "rewards/rejected": -4.422765254974365, + "sft_loss": 3.1898136138916016, + "step": 4595 + }, + { + "epoch": 2.4619501588894463, + "grad_norm": 13.479154984246103, + "learning_rate": 9.465097997976412e-08, + "logits/chosen": -0.25030630826950073, + "logits/rejected": 0.016726698726415634, + "logps/chosen": -3.1371049880981445, + "logps/rejected": -4.500518798828125, + "loss": 0.421, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.1371049880981445, + "rewards/margins": 1.363413691520691, + "rewards/rejected": -4.500518798828125, + "sft_loss": 3.3470330238342285, + "step": 4600 + }, + { + "epoch": 2.464626191670848, + "grad_norm": 17.62918485569358, + "learning_rate": 9.374114382176457e-08, + "logits/chosen": -0.2329416275024414, + "logits/rejected": -0.02414068579673767, + "logps/chosen": -3.3003411293029785, + "logps/rejected": -4.541224002838135, + "loss": 0.4822, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -3.3003411293029785, + "rewards/margins": 1.2408829927444458, + "rewards/rejected": -4.541224002838135, + "sft_loss": 3.5061416625976562, + "step": 4605 + }, + { + "epoch": 2.46730222445225, + "grad_norm": 18.57472213271513, + "learning_rate": 9.283524918896945e-08, + "logits/chosen": -0.26498347520828247, + "logits/rejected": -0.09776406735181808, + "logps/chosen": -3.2808425426483154, + "logps/rejected": -4.538356781005859, + "loss": 0.4765, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.2808425426483154, + "rewards/margins": 1.2575145959854126, + "rewards/rejected": -4.538356781005859, + "sft_loss": 3.3779423236846924, + "step": 4610 + }, + { + "epoch": 2.469978257233651, + "grad_norm": 17.256523520599416, + "learning_rate": 9.193330487037232e-08, + "logits/chosen": -0.2040288746356964, + "logits/rejected": 0.01880154386162758, + "logps/chosen": -3.298178195953369, + "logps/rejected": -4.639803886413574, + "loss": 0.4535, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -3.298178195953369, + "rewards/margins": 1.3416259288787842, + "rewards/rejected": -4.639803886413574, + "sft_loss": 3.4679908752441406, + "step": 4615 + }, + { + "epoch": 2.4726542900150528, + "grad_norm": 16.880230708349227, + "learning_rate": 9.103531961664118e-08, + "logits/chosen": -0.20273037254810333, + "logits/rejected": 0.042335350066423416, + "logps/chosen": -3.030534029006958, + "logps/rejected": -4.304599761962891, + "loss": 0.4104, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -3.030534029006958, + "rewards/margins": 1.2740657329559326, + "rewards/rejected": -4.304599761962891, + "sft_loss": 3.2851033210754395, + "step": 4620 + }, + { + "epoch": 2.475330322796454, + "grad_norm": 17.230042736206773, + "learning_rate": 9.014130214003269e-08, + "logits/chosen": -0.2727370858192444, + "logits/rejected": -0.240513414144516, + "logps/chosen": -3.1677136421203613, + "logps/rejected": -4.453676223754883, + "loss": 0.4537, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.1677136421203613, + "rewards/margins": 1.285962462425232, + "rewards/rejected": -4.453676223754883, + "sft_loss": 3.2961056232452393, + "step": 4625 + }, + { + "epoch": 2.4780063555778558, + "grad_norm": 19.477289020834274, + "learning_rate": 8.925126111430848e-08, + "logits/chosen": -0.1317545622587204, + "logits/rejected": -0.0074939606711268425, + "logps/chosen": -3.152268886566162, + "logps/rejected": -4.429300308227539, + "loss": 0.4496, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -3.152268886566162, + "rewards/margins": 1.277031660079956, + "rewards/rejected": -4.429300308227539, + "sft_loss": 3.366619825363159, + "step": 4630 + }, + { + "epoch": 2.4806823883592575, + "grad_norm": 21.47276325117744, + "learning_rate": 8.83652051746504e-08, + "logits/chosen": -0.1325719654560089, + "logits/rejected": 0.07381542026996613, + "logps/chosen": -3.3022446632385254, + "logps/rejected": -4.655218601226807, + "loss": 0.4566, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -3.3022446632385254, + "rewards/margins": 1.3529744148254395, + "rewards/rejected": -4.655218601226807, + "sft_loss": 3.4520957469940186, + "step": 4635 + }, + { + "epoch": 2.483358421140659, + "grad_norm": 15.970436118864976, + "learning_rate": 8.748314291757696e-08, + "logits/chosen": -0.19755138456821442, + "logits/rejected": -0.045385174453258514, + "logps/chosen": -3.224997043609619, + "logps/rejected": -4.416315078735352, + "loss": 0.4601, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.224997043609619, + "rewards/margins": 1.1913175582885742, + "rewards/rejected": -4.416315078735352, + "sft_loss": 3.3598480224609375, + "step": 4640 + }, + { + "epoch": 2.4860344539220605, + "grad_norm": 17.382939228249764, + "learning_rate": 8.660508290086032e-08, + "logits/chosen": -0.23071089386940002, + "logits/rejected": -0.038423918187618256, + "logps/chosen": -3.2272229194641113, + "logps/rejected": -4.533236503601074, + "loss": 0.4435, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -3.2272229194641113, + "rewards/margins": 1.306014060974121, + "rewards/rejected": -4.533236503601074, + "sft_loss": 3.416638135910034, + "step": 4645 + }, + { + "epoch": 2.488710486703462, + "grad_norm": 21.010187765561493, + "learning_rate": 8.573103364344231e-08, + "logits/chosen": -0.292992502450943, + "logits/rejected": -0.0057989866472780704, + "logps/chosen": -3.13838267326355, + "logps/rejected": -4.402870178222656, + "loss": 0.4478, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -3.13838267326355, + "rewards/margins": 1.2644875049591064, + "rewards/rejected": -4.402870178222656, + "sft_loss": 3.1901614665985107, + "step": 4650 + }, + { + "epoch": 2.4913865194848634, + "grad_norm": 20.868876516980052, + "learning_rate": 8.486100362535292e-08, + "logits/chosen": -0.2437940388917923, + "logits/rejected": -0.0457330122590065, + "logps/chosen": -3.2630112171173096, + "logps/rejected": -4.354058742523193, + "loss": 0.4887, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.2630112171173096, + "rewards/margins": 1.091047763824463, + "rewards/rejected": -4.354058742523193, + "sft_loss": 3.476834774017334, + "step": 4655 + }, + { + "epoch": 2.494062552266265, + "grad_norm": 14.923011648501324, + "learning_rate": 8.399500128762693e-08, + "logits/chosen": -0.20912370085716248, + "logits/rejected": -0.05193439871072769, + "logps/chosen": -3.2698898315429688, + "logps/rejected": -4.5080246925354, + "loss": 0.4386, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.2698898315429688, + "rewards/margins": 1.2381350994110107, + "rewards/rejected": -4.5080246925354, + "sft_loss": 3.34809947013855, + "step": 4660 + }, + { + "epoch": 2.496738585047667, + "grad_norm": 19.72037202498631, + "learning_rate": 8.313303503222313e-08, + "logits/chosen": -0.2273527830839157, + "logits/rejected": -0.1118924468755722, + "logps/chosen": -3.168461561203003, + "logps/rejected": -4.356649875640869, + "loss": 0.4783, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.168461561203003, + "rewards/margins": 1.1881887912750244, + "rewards/rejected": -4.356649875640869, + "sft_loss": 3.3031134605407715, + "step": 4665 + }, + { + "epoch": 2.4994146178290686, + "grad_norm": 20.934731505678513, + "learning_rate": 8.227511322194164e-08, + "logits/chosen": -0.24165849387645721, + "logits/rejected": -0.05451079457998276, + "logps/chosen": -3.0880351066589355, + "logps/rejected": -4.163828372955322, + "loss": 0.4786, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -3.0880351066589355, + "rewards/margins": 1.0757930278778076, + "rewards/rejected": -4.163828372955322, + "sft_loss": 3.1622893810272217, + "step": 4670 + }, + { + "epoch": 2.50209065061047, + "grad_norm": 26.497265788901736, + "learning_rate": 8.142124418034385e-08, + "logits/chosen": -0.15390422940254211, + "logits/rejected": 0.05969760939478874, + "logps/chosen": -3.1597721576690674, + "logps/rejected": -4.265137672424316, + "loss": 0.5263, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.1597721576690674, + "rewards/margins": 1.1053650379180908, + "rewards/rejected": -4.265137672424316, + "sft_loss": 3.3072402477264404, + "step": 4675 + }, + { + "epoch": 2.5047666833918716, + "grad_norm": 22.961685650281634, + "learning_rate": 8.057143619167073e-08, + "logits/chosen": -0.14675331115722656, + "logits/rejected": -0.010864680632948875, + "logps/chosen": -3.1171987056732178, + "logps/rejected": -4.339387893676758, + "loss": 0.48, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.1171987056732178, + "rewards/margins": 1.2221894264221191, + "rewards/rejected": -4.339387893676758, + "sft_loss": 3.211840867996216, + "step": 4680 + }, + { + "epoch": 2.507442716173273, + "grad_norm": 12.264521340299805, + "learning_rate": 7.97256975007633e-08, + "logits/chosen": -0.24845509231090546, + "logits/rejected": 0.022098522633314133, + "logps/chosen": -3.1032533645629883, + "logps/rejected": -4.3729448318481445, + "loss": 0.434, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.1032533645629883, + "rewards/margins": 1.2696917057037354, + "rewards/rejected": -4.3729448318481445, + "sft_loss": 3.2458713054656982, + "step": 4685 + }, + { + "epoch": 2.5101187489546746, + "grad_norm": 17.017415580056074, + "learning_rate": 7.888403631298186e-08, + "logits/chosen": -0.15666857361793518, + "logits/rejected": -0.0690535455942154, + "logps/chosen": -3.0472021102905273, + "logps/rejected": -4.24338960647583, + "loss": 0.4723, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -3.0472021102905273, + "rewards/margins": 1.1961876153945923, + "rewards/rejected": -4.24338960647583, + "sft_loss": 3.148030996322632, + "step": 4690 + }, + { + "epoch": 2.5127947817360763, + "grad_norm": 15.608211137207162, + "learning_rate": 7.804646079412719e-08, + "logits/chosen": -0.17254853248596191, + "logits/rejected": 0.058733534067869186, + "logps/chosen": -3.291088819503784, + "logps/rejected": -4.588837146759033, + "loss": 0.455, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.291088819503784, + "rewards/margins": 1.2977479696273804, + "rewards/rejected": -4.588837146759033, + "sft_loss": 3.428480863571167, + "step": 4695 + }, + { + "epoch": 2.515470814517478, + "grad_norm": 16.368817132261125, + "learning_rate": 7.72129790703604e-08, + "logits/chosen": -0.28066256642341614, + "logits/rejected": -0.10028629004955292, + "logps/chosen": -3.1184608936309814, + "logps/rejected": -4.201938629150391, + "loss": 0.4842, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.1184608936309814, + "rewards/margins": 1.0834776163101196, + "rewards/rejected": -4.201938629150391, + "sft_loss": 3.340056896209717, + "step": 4700 + }, + { + "epoch": 2.5181468472988793, + "grad_norm": 21.002405475983192, + "learning_rate": 7.638359922812504e-08, + "logits/chosen": -0.16717539727687836, + "logits/rejected": -0.06876533478498459, + "logps/chosen": -3.1695704460144043, + "logps/rejected": -4.404815673828125, + "loss": 0.481, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.1695704460144043, + "rewards/margins": 1.2352455854415894, + "rewards/rejected": -4.404815673828125, + "sft_loss": 3.212214708328247, + "step": 4705 + }, + { + "epoch": 2.520822880080281, + "grad_norm": 21.183191402271422, + "learning_rate": 7.555832931406774e-08, + "logits/chosen": -0.2358173429965973, + "logits/rejected": 0.015494110994040966, + "logps/chosen": -3.171396493911743, + "logps/rejected": -4.4364213943481445, + "loss": 0.454, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.171396493911743, + "rewards/margins": 1.2650253772735596, + "rewards/rejected": -4.4364213943481445, + "sft_loss": 3.3194046020507812, + "step": 4710 + }, + { + "epoch": 2.5234989128616827, + "grad_norm": 15.518506735899832, + "learning_rate": 7.47371773349611e-08, + "logits/chosen": -0.17813552916049957, + "logits/rejected": -0.13089530169963837, + "logps/chosen": -3.214340925216675, + "logps/rejected": -4.631179332733154, + "loss": 0.4016, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -3.214340925216675, + "rewards/margins": 1.4168381690979004, + "rewards/rejected": -4.631179332733154, + "sft_loss": 3.372272491455078, + "step": 4715 + }, + { + "epoch": 2.526174945643084, + "grad_norm": 17.855029324563343, + "learning_rate": 7.392015125762496e-08, + "logits/chosen": -0.2225007265806198, + "logits/rejected": -0.036283355206251144, + "logps/chosen": -3.060300350189209, + "logps/rejected": -4.420039176940918, + "loss": 0.4043, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.060300350189209, + "rewards/margins": 1.3597384691238403, + "rewards/rejected": -4.420039176940918, + "sft_loss": 3.1908531188964844, + "step": 4720 + }, + { + "epoch": 2.5288509784244857, + "grad_norm": 18.797398384421548, + "learning_rate": 7.310725900885018e-08, + "logits/chosen": -0.25443488359451294, + "logits/rejected": -0.1568032056093216, + "logps/chosen": -3.246868133544922, + "logps/rejected": -4.453755855560303, + "loss": 0.5097, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.246868133544922, + "rewards/margins": 1.2068877220153809, + "rewards/rejected": -4.453755855560303, + "sft_loss": 3.4013404846191406, + "step": 4725 + }, + { + "epoch": 2.5315270112058874, + "grad_norm": 18.674359255751934, + "learning_rate": 7.229850847532076e-08, + "logits/chosen": -0.18834123015403748, + "logits/rejected": 0.01394510269165039, + "logps/chosen": -3.13272762298584, + "logps/rejected": -4.519242286682129, + "loss": 0.4157, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -3.13272762298584, + "rewards/margins": 1.3865143060684204, + "rewards/rejected": -4.519242286682129, + "sft_loss": 3.35652494430542, + "step": 4730 + }, + { + "epoch": 2.5342030439872887, + "grad_norm": 18.356699406304966, + "learning_rate": 7.149390750353779e-08, + "logits/chosen": -0.11418505012989044, + "logits/rejected": -0.14707979559898376, + "logps/chosen": -3.3050544261932373, + "logps/rejected": -4.507040977478027, + "loss": 0.4266, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.3050544261932373, + "rewards/margins": 1.2019859552383423, + "rewards/rejected": -4.507040977478027, + "sft_loss": 3.4293060302734375, + "step": 4735 + }, + { + "epoch": 2.5368790767686904, + "grad_norm": 13.183960579601973, + "learning_rate": 7.069346389974374e-08, + "logits/chosen": -0.23427972197532654, + "logits/rejected": -0.03690224885940552, + "logps/chosen": -3.3357748985290527, + "logps/rejected": -4.470101833343506, + "loss": 0.491, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.3357748985290527, + "rewards/margins": 1.1343269348144531, + "rewards/rejected": -4.470101833343506, + "sft_loss": 3.506957530975342, + "step": 4740 + }, + { + "epoch": 2.539555109550092, + "grad_norm": 19.94553345695387, + "learning_rate": 6.989718542984563e-08, + "logits/chosen": -0.2038155049085617, + "logits/rejected": -0.12303312122821808, + "logps/chosen": -3.2963204383850098, + "logps/rejected": -4.538512229919434, + "loss": 0.4771, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -3.2963204383850098, + "rewards/margins": 1.242192029953003, + "rewards/rejected": -4.538512229919434, + "sft_loss": 3.4833292961120605, + "step": 4745 + }, + { + "epoch": 2.5422311423314934, + "grad_norm": 17.211013156643542, + "learning_rate": 6.9105079819341e-08, + "logits/chosen": -0.17637403309345245, + "logits/rejected": 0.12716087698936462, + "logps/chosen": -3.2101187705993652, + "logps/rejected": -4.727436542510986, + "loss": 0.3764, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.2101187705993652, + "rewards/margins": 1.517317771911621, + "rewards/rejected": -4.727436542510986, + "sft_loss": 3.330015182495117, + "step": 4750 + }, + { + "epoch": 2.544907175112895, + "grad_norm": 18.59545753629147, + "learning_rate": 6.831715475324163e-08, + "logits/chosen": -0.25424811244010925, + "logits/rejected": -0.03308358043432236, + "logps/chosen": -3.2420711517333984, + "logps/rejected": -4.640331745147705, + "loss": 0.4475, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.2420711517333984, + "rewards/margins": 1.3982599973678589, + "rewards/rejected": -4.640331745147705, + "sft_loss": 3.4349277019500732, + "step": 4755 + }, + { + "epoch": 2.547583207894297, + "grad_norm": 16.47927838456307, + "learning_rate": 6.753341787600026e-08, + "logits/chosen": -0.3032090961933136, + "logits/rejected": -0.17475828528404236, + "logps/chosen": -3.182875633239746, + "logps/rejected": -4.66715145111084, + "loss": 0.3949, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -3.182875633239746, + "rewards/margins": 1.4842758178710938, + "rewards/rejected": -4.66715145111084, + "sft_loss": 3.4110794067382812, + "step": 4760 + }, + { + "epoch": 2.5502592406756985, + "grad_norm": 20.809819346399998, + "learning_rate": 6.67538767914353e-08, + "logits/chosen": -0.2470838725566864, + "logits/rejected": -0.023734260350465775, + "logps/chosen": -3.3000004291534424, + "logps/rejected": -4.46333646774292, + "loss": 0.482, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -3.3000004291534424, + "rewards/margins": 1.1633365154266357, + "rewards/rejected": -4.46333646774292, + "sft_loss": 3.459524631500244, + "step": 4765 + }, + { + "epoch": 2.5529352734571, + "grad_norm": 22.254439383465648, + "learning_rate": 6.597853906265793e-08, + "logits/chosen": -0.2009831666946411, + "logits/rejected": -0.017525160685181618, + "logps/chosen": -3.3135883808135986, + "logps/rejected": -4.76369047164917, + "loss": 0.4309, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.3135883808135986, + "rewards/margins": 1.4501022100448608, + "rewards/rejected": -4.76369047164917, + "sft_loss": 3.384169816970825, + "step": 4770 + }, + { + "epoch": 2.5556113062385015, + "grad_norm": 21.898141341281875, + "learning_rate": 6.5207412211998e-08, + "logits/chosen": -0.1036507710814476, + "logits/rejected": 0.020888470113277435, + "logps/chosen": -3.3246219158172607, + "logps/rejected": -4.676529884338379, + "loss": 0.4777, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -3.3246219158172607, + "rewards/margins": 1.3519079685211182, + "rewards/rejected": -4.676529884338379, + "sft_loss": 3.3856797218322754, + "step": 4775 + }, + { + "epoch": 2.558287339019903, + "grad_norm": 17.081647007030167, + "learning_rate": 6.444050372093186e-08, + "logits/chosen": -0.26109281182289124, + "logits/rejected": -0.11180180311203003, + "logps/chosen": -3.223278045654297, + "logps/rejected": -4.432318687438965, + "loss": 0.4501, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -3.223278045654297, + "rewards/margins": 1.2090413570404053, + "rewards/rejected": -4.432318687438965, + "sft_loss": 3.346792697906494, + "step": 4780 + }, + { + "epoch": 2.5609633718013045, + "grad_norm": 19.517956039813384, + "learning_rate": 6.367782103000873e-08, + "logits/chosen": -0.20629271864891052, + "logits/rejected": -0.1311255395412445, + "logps/chosen": -3.2419943809509277, + "logps/rejected": -4.231849670410156, + "loss": 0.5059, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -3.2419943809509277, + "rewards/margins": 0.9898551106452942, + "rewards/rejected": -4.231849670410156, + "sft_loss": 3.343217372894287, + "step": 4785 + }, + { + "epoch": 2.5636394045827062, + "grad_norm": 17.066269107224198, + "learning_rate": 6.29193715387798e-08, + "logits/chosen": -0.2514341175556183, + "logits/rejected": -0.09561355412006378, + "logps/chosen": -3.2857754230499268, + "logps/rejected": -4.550307273864746, + "loss": 0.4694, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.2857754230499268, + "rewards/margins": 1.2645318508148193, + "rewards/rejected": -4.550307273864746, + "sft_loss": 3.3797073364257812, + "step": 4790 + }, + { + "epoch": 2.566315437364108, + "grad_norm": 26.19436760931643, + "learning_rate": 6.216516260572502e-08, + "logits/chosen": -0.19456951320171356, + "logits/rejected": -0.04334023594856262, + "logps/chosen": -3.3537120819091797, + "logps/rejected": -4.570885181427002, + "loss": 0.49, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.3537120819091797, + "rewards/margins": 1.2171725034713745, + "rewards/rejected": -4.570885181427002, + "sft_loss": 3.438067674636841, + "step": 4795 + }, + { + "epoch": 2.568991470145509, + "grad_norm": 15.876151554491901, + "learning_rate": 6.141520154818297e-08, + "logits/chosen": -0.23168060183525085, + "logits/rejected": -0.09211653470993042, + "logps/chosen": -3.15653395652771, + "logps/rejected": -4.232216835021973, + "loss": 0.4952, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -3.15653395652771, + "rewards/margins": 1.0756828784942627, + "rewards/rejected": -4.232216835021973, + "sft_loss": 3.413440227508545, + "step": 4800 + }, + { + "epoch": 2.568991470145509, + "eval_logits/chosen": 0.1935766041278839, + "eval_logits/rejected": 0.32095426321029663, + "eval_logps/chosen": -3.3064892292022705, + "eval_logps/rejected": -4.356972694396973, + "eval_loss": 0.5579966306686401, + "eval_rewards/accuracies": 0.7232937812805176, + "eval_rewards/chosen": -3.3064892292022705, + "eval_rewards/margins": 1.050482988357544, + "eval_rewards/rejected": -4.356972694396973, + "eval_runtime": 49.257, + "eval_samples_per_second": 27.306, + "eval_sft_loss": 3.4401662349700928, + "eval_steps_per_second": 6.842, + "step": 4800 + }, + { + "epoch": 2.571667502926911, + "grad_norm": 25.098344738779687, + "learning_rate": 6.066949564227897e-08, + "logits/chosen": -0.2947044372558594, + "logits/rejected": -0.14726006984710693, + "logps/chosen": -3.2008023262023926, + "logps/rejected": -4.374998092651367, + "loss": 0.506, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -3.2008023262023926, + "rewards/margins": 1.1741957664489746, + "rewards/rejected": -4.374998092651367, + "sft_loss": 3.3073036670684814, + "step": 4805 + }, + { + "epoch": 2.574343535708312, + "grad_norm": 17.540367301185075, + "learning_rate": 5.992805212285523e-08, + "logits/chosen": -0.23283466696739197, + "logits/rejected": -0.10140033066272736, + "logps/chosen": -3.1717512607574463, + "logps/rejected": -4.4409589767456055, + "loss": 0.4676, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -3.1717512607574463, + "rewards/margins": 1.2692081928253174, + "rewards/rejected": -4.4409589767456055, + "sft_loss": 3.3278002738952637, + "step": 4810 + }, + { + "epoch": 2.577019568489714, + "grad_norm": 22.638111065501505, + "learning_rate": 5.9190878183399684e-08, + "logits/chosen": -0.2398640662431717, + "logits/rejected": -0.06665889918804169, + "logps/chosen": -2.9983909130096436, + "logps/rejected": -4.388098239898682, + "loss": 0.4861, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.9983909130096436, + "rewards/margins": 1.3897074460983276, + "rewards/rejected": -4.388098239898682, + "sft_loss": 3.2251102924346924, + "step": 4815 + }, + { + "epoch": 2.5796956012711156, + "grad_norm": 25.42150384905573, + "learning_rate": 5.845798097597748e-08, + "logits/chosen": -0.19961199164390564, + "logits/rejected": -0.07595672458410263, + "logps/chosen": -3.2427210807800293, + "logps/rejected": -4.271923542022705, + "loss": 0.494, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.2427210807800293, + "rewards/margins": 1.029201626777649, + "rewards/rejected": -4.271923542022705, + "sft_loss": 3.2815985679626465, + "step": 4820 + }, + { + "epoch": 2.5823716340525174, + "grad_norm": 18.515170835784442, + "learning_rate": 5.772936761116026e-08, + "logits/chosen": -0.19458623230457306, + "logits/rejected": -0.0016370117664337158, + "logps/chosen": -3.1516411304473877, + "logps/rejected": -4.336785793304443, + "loss": 0.4557, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -3.1516411304473877, + "rewards/margins": 1.1851444244384766, + "rewards/rejected": -4.336785793304443, + "sft_loss": 3.2261505126953125, + "step": 4825 + }, + { + "epoch": 2.5850476668339186, + "grad_norm": 23.696243181872738, + "learning_rate": 5.700504515795829e-08, + "logits/chosen": -0.2503166198730469, + "logits/rejected": -0.04413991421461105, + "logps/chosen": -3.2913098335266113, + "logps/rejected": -4.465832710266113, + "loss": 0.4647, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.2913098335266113, + "rewards/margins": 1.1745226383209229, + "rewards/rejected": -4.465832710266113, + "sft_loss": 3.4344570636749268, + "step": 4830 + }, + { + "epoch": 2.5877236996153203, + "grad_norm": 19.82750433438805, + "learning_rate": 5.628502064375101e-08, + "logits/chosen": -0.35966330766677856, + "logits/rejected": -0.12091150134801865, + "logps/chosen": -3.070188283920288, + "logps/rejected": -4.504461288452148, + "loss": 0.4042, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.070188283920288, + "rewards/margins": 1.4342727661132812, + "rewards/rejected": -4.504461288452148, + "sft_loss": 3.1828949451446533, + "step": 4835 + }, + { + "epoch": 2.5903997323967216, + "grad_norm": 23.980809155950052, + "learning_rate": 5.55693010542197e-08, + "logits/chosen": -0.3103236258029938, + "logits/rejected": -0.028705209493637085, + "logps/chosen": -3.0640907287597656, + "logps/rejected": -4.454373359680176, + "loss": 0.3997, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -3.0640907287597656, + "rewards/margins": 1.3902822732925415, + "rewards/rejected": -4.454373359680176, + "sft_loss": 3.190805435180664, + "step": 4840 + }, + { + "epoch": 2.5930757651781233, + "grad_norm": 19.274243518176583, + "learning_rate": 5.485789333327856e-08, + "logits/chosen": -0.2074917107820511, + "logits/rejected": -0.13893333077430725, + "logps/chosen": -3.134070634841919, + "logps/rejected": -4.3168511390686035, + "loss": 0.4809, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.134070634841919, + "rewards/margins": 1.1827805042266846, + "rewards/rejected": -4.3168511390686035, + "sft_loss": 3.336045026779175, + "step": 4845 + }, + { + "epoch": 2.595751797959525, + "grad_norm": 21.548530065007366, + "learning_rate": 5.4150804383008675e-08, + "logits/chosen": -0.3585580289363861, + "logits/rejected": -0.15167434513568878, + "logps/chosen": -3.2743637561798096, + "logps/rejected": -4.633362770080566, + "loss": 0.4475, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.2743637561798096, + "rewards/margins": 1.3589991331100464, + "rewards/rejected": -4.633362770080566, + "sft_loss": 3.377138614654541, + "step": 4850 + }, + { + "epoch": 2.5984278307409268, + "grad_norm": 19.91785176089767, + "learning_rate": 5.344804106359002e-08, + "logits/chosen": -0.1923711746931076, + "logits/rejected": 0.0029977380763739347, + "logps/chosen": -3.04135799407959, + "logps/rejected": -4.348074436187744, + "loss": 0.4699, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.04135799407959, + "rewards/margins": 1.3067158460617065, + "rewards/rejected": -4.348074436187744, + "sft_loss": 3.242629289627075, + "step": 4855 + }, + { + "epoch": 2.601103863522328, + "grad_norm": 22.53903758877158, + "learning_rate": 5.274961019323559e-08, + "logits/chosen": -0.2610209584236145, + "logits/rejected": -0.14107844233512878, + "logps/chosen": -2.971292495727539, + "logps/rejected": -4.198145866394043, + "loss": 0.4488, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.971292495727539, + "rewards/margins": 1.226853370666504, + "rewards/rejected": -4.198145866394043, + "sft_loss": 3.202307939529419, + "step": 4860 + }, + { + "epoch": 2.6037798963037297, + "grad_norm": 10.943657723423541, + "learning_rate": 5.205551854812451e-08, + "logits/chosen": -0.3178286552429199, + "logits/rejected": -0.18322348594665527, + "logps/chosen": -3.2609658241271973, + "logps/rejected": -4.589636325836182, + "loss": 0.4319, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.2609658241271973, + "rewards/margins": 1.328670859336853, + "rewards/rejected": -4.589636325836182, + "sft_loss": 3.3975632190704346, + "step": 4865 + }, + { + "epoch": 2.606455929085131, + "grad_norm": 16.361947175383595, + "learning_rate": 5.1365772862337177e-08, + "logits/chosen": -0.20097629725933075, + "logits/rejected": -0.04026917368173599, + "logps/chosen": -2.9677042961120605, + "logps/rejected": -4.50205659866333, + "loss": 0.3729, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.9677042961120605, + "rewards/margins": 1.5343520641326904, + "rewards/rejected": -4.50205659866333, + "sft_loss": 3.053894519805908, + "step": 4870 + }, + { + "epoch": 2.6091319618665327, + "grad_norm": 24.718626158852302, + "learning_rate": 5.068037982778905e-08, + "logits/chosen": -0.16204114258289337, + "logits/rejected": -0.04081437736749649, + "logps/chosen": -3.022818088531494, + "logps/rejected": -4.330333232879639, + "loss": 0.4815, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.022818088531494, + "rewards/margins": 1.307515263557434, + "rewards/rejected": -4.330333232879639, + "sft_loss": 3.2373275756835938, + "step": 4875 + }, + { + "epoch": 2.6118079946479344, + "grad_norm": 13.09594561850234, + "learning_rate": 4.999934609416656e-08, + "logits/chosen": -0.11323384195566177, + "logits/rejected": 0.052326686680316925, + "logps/chosen": -3.061737537384033, + "logps/rejected": -4.523990631103516, + "loss": 0.4264, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -3.061737537384033, + "rewards/margins": 1.4622533321380615, + "rewards/rejected": -4.523990631103516, + "sft_loss": 3.2748007774353027, + "step": 4880 + }, + { + "epoch": 2.614484027429336, + "grad_norm": 17.96320824985329, + "learning_rate": 4.932267826886183e-08, + "logits/chosen": -0.1457149088382721, + "logits/rejected": -0.06468679010868073, + "logps/chosen": -3.215524196624756, + "logps/rejected": -4.595486640930176, + "loss": 0.4412, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.215524196624756, + "rewards/margins": 1.3799618482589722, + "rewards/rejected": -4.595486640930176, + "sft_loss": 3.408809185028076, + "step": 4885 + }, + { + "epoch": 2.6171600602107374, + "grad_norm": 18.882118102463277, + "learning_rate": 4.8650382916909206e-08, + "logits/chosen": -0.32069796323776245, + "logits/rejected": -0.09536401927471161, + "logps/chosen": -3.2022228240966797, + "logps/rejected": -4.488520622253418, + "loss": 0.4706, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.2022228240966797, + "rewards/margins": 1.2862979173660278, + "rewards/rejected": -4.488520622253418, + "sft_loss": 3.4009814262390137, + "step": 4890 + }, + { + "epoch": 2.619836092992139, + "grad_norm": 15.427171984599685, + "learning_rate": 4.7982466560920976e-08, + "logits/chosen": -0.21969299018383026, + "logits/rejected": -0.10536585748195648, + "logps/chosen": -3.188559055328369, + "logps/rejected": -4.181743621826172, + "loss": 0.5336, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.188559055328369, + "rewards/margins": 0.9931844472885132, + "rewards/rejected": -4.181743621826172, + "sft_loss": 3.355111598968506, + "step": 4895 + }, + { + "epoch": 2.622512125773541, + "grad_norm": 21.961264722235224, + "learning_rate": 4.7318935681024685e-08, + "logits/chosen": -0.15709593892097473, + "logits/rejected": 0.04801901429891586, + "logps/chosen": -3.1420769691467285, + "logps/rejected": -4.425694942474365, + "loss": 0.4329, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -3.1420769691467285, + "rewards/margins": 1.2836177349090576, + "rewards/rejected": -4.425694942474365, + "sft_loss": 3.3105030059814453, + "step": 4900 + }, + { + "epoch": 2.625188158554942, + "grad_norm": 15.599904597557535, + "learning_rate": 4.6659796714799745e-08, + "logits/chosen": -0.1877932846546173, + "logits/rejected": 0.018878992646932602, + "logps/chosen": -3.198349714279175, + "logps/rejected": -4.627870082855225, + "loss": 0.4091, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.198349714279175, + "rewards/margins": 1.4295203685760498, + "rewards/rejected": -4.627870082855225, + "sft_loss": 3.4322593212127686, + "step": 4905 + }, + { + "epoch": 2.627864191336344, + "grad_norm": 16.54048979952472, + "learning_rate": 4.60050560572155e-08, + "logits/chosen": -0.22201958298683167, + "logits/rejected": -0.22817635536193848, + "logps/chosen": -3.1096553802490234, + "logps/rejected": -4.568678855895996, + "loss": 0.4544, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.1096553802490234, + "rewards/margins": 1.45902419090271, + "rewards/rejected": -4.568678855895996, + "sft_loss": 3.2503561973571777, + "step": 4910 + }, + { + "epoch": 2.6305402241177456, + "grad_norm": 21.385888084225407, + "learning_rate": 4.535472006056834e-08, + "logits/chosen": -0.22342491149902344, + "logits/rejected": -0.040875911712646484, + "logps/chosen": -3.0525319576263428, + "logps/rejected": -4.305843353271484, + "loss": 0.46, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -3.0525319576263428, + "rewards/margins": 1.2533115148544312, + "rewards/rejected": -4.305843353271484, + "sft_loss": 3.2571754455566406, + "step": 4915 + }, + { + "epoch": 2.6332162568991473, + "grad_norm": 18.92057715118689, + "learning_rate": 4.470879503442132e-08, + "logits/chosen": -0.23863354325294495, + "logits/rejected": -0.08575797080993652, + "logps/chosen": -3.198439598083496, + "logps/rejected": -4.544912338256836, + "loss": 0.4611, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.198439598083496, + "rewards/margins": 1.3464728593826294, + "rewards/rejected": -4.544912338256836, + "sft_loss": 3.3559088706970215, + "step": 4920 + }, + { + "epoch": 2.6358922896805486, + "grad_norm": 16.23594963751572, + "learning_rate": 4.406728724554154e-08, + "logits/chosen": -0.36170822381973267, + "logits/rejected": -0.05511064454913139, + "logps/chosen": -3.168303966522217, + "logps/rejected": -4.5181884765625, + "loss": 0.4467, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.168303966522217, + "rewards/margins": 1.3498847484588623, + "rewards/rejected": -4.5181884765625, + "sft_loss": 3.367772340774536, + "step": 4925 + }, + { + "epoch": 2.6385683224619503, + "grad_norm": 15.157398299332206, + "learning_rate": 4.3430202917840664e-08, + "logits/chosen": -0.2529342770576477, + "logits/rejected": -0.003716734703630209, + "logps/chosen": -3.219101667404175, + "logps/rejected": -4.630505561828613, + "loss": 0.4364, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -3.219101667404175, + "rewards/margins": 1.4114038944244385, + "rewards/rejected": -4.630505561828613, + "sft_loss": 3.2867751121520996, + "step": 4930 + }, + { + "epoch": 2.6412443552433515, + "grad_norm": 25.37551161557867, + "learning_rate": 4.279754823231346e-08, + "logits/chosen": -0.2770128846168518, + "logits/rejected": -0.029742831364274025, + "logps/chosen": -3.149597644805908, + "logps/rejected": -4.365159034729004, + "loss": 0.4824, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -3.149597644805908, + "rewards/margins": 1.2155616283416748, + "rewards/rejected": -4.365159034729004, + "sft_loss": 3.2964344024658203, + "step": 4935 + }, + { + "epoch": 2.6439203880247533, + "grad_norm": 15.297478140529776, + "learning_rate": 4.216932932697859e-08, + "logits/chosen": -0.2701972723007202, + "logits/rejected": -0.12614202499389648, + "logps/chosen": -3.1263914108276367, + "logps/rejected": -4.205435752868652, + "loss": 0.4734, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.1263914108276367, + "rewards/margins": 1.0790444612503052, + "rewards/rejected": -4.205435752868652, + "sft_loss": 3.333831787109375, + "step": 4940 + }, + { + "epoch": 2.646596420806155, + "grad_norm": 22.327646162788575, + "learning_rate": 4.154555229681844e-08, + "logits/chosen": -0.23707015812397003, + "logits/rejected": 0.0013411410618573427, + "logps/chosen": -3.1890475749969482, + "logps/rejected": -4.5590715408325195, + "loss": 0.4182, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.1890475749969482, + "rewards/margins": 1.3700246810913086, + "rewards/rejected": -4.5590715408325195, + "sft_loss": 3.3222873210906982, + "step": 4945 + }, + { + "epoch": 2.6492724535875567, + "grad_norm": 24.20219575368579, + "learning_rate": 4.092622319372069e-08, + "logits/chosen": -0.19819870591163635, + "logits/rejected": 0.018138539046049118, + "logps/chosen": -3.139855146408081, + "logps/rejected": -4.3406782150268555, + "loss": 0.5064, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.139855146408081, + "rewards/margins": 1.200823187828064, + "rewards/rejected": -4.3406782150268555, + "sft_loss": 3.278446912765503, + "step": 4950 + }, + { + "epoch": 2.651948486368958, + "grad_norm": 22.468355517091446, + "learning_rate": 4.031134802641889e-08, + "logits/chosen": -0.2404346764087677, + "logits/rejected": -0.163672536611557, + "logps/chosen": -3.2455620765686035, + "logps/rejected": -4.417026519775391, + "loss": 0.4541, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -3.2455620765686035, + "rewards/margins": 1.1714643239974976, + "rewards/rejected": -4.417026519775391, + "sft_loss": 3.4159579277038574, + "step": 4955 + }, + { + "epoch": 2.6546245191503597, + "grad_norm": 16.006458569884785, + "learning_rate": 3.970093276043468e-08, + "logits/chosen": -0.20295009016990662, + "logits/rejected": -0.056883059442043304, + "logps/chosen": -3.079036235809326, + "logps/rejected": -4.43948221206665, + "loss": 0.4306, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -3.079036235809326, + "rewards/margins": 1.3604458570480347, + "rewards/rejected": -4.43948221206665, + "sft_loss": 3.2619762420654297, + "step": 4960 + }, + { + "epoch": 2.657300551931761, + "grad_norm": 21.707948896911393, + "learning_rate": 3.9094983318019584e-08, + "logits/chosen": -0.28873950242996216, + "logits/rejected": -0.11398355662822723, + "logps/chosen": -3.094452381134033, + "logps/rejected": -4.441015720367432, + "loss": 0.4314, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.094452381134033, + "rewards/margins": 1.3465631008148193, + "rewards/rejected": -4.441015720367432, + "sft_loss": 3.330700397491455, + "step": 4965 + }, + { + "epoch": 2.6599765847131627, + "grad_norm": 16.677700003919625, + "learning_rate": 3.849350557809789e-08, + "logits/chosen": -0.14867134392261505, + "logits/rejected": -0.04668787866830826, + "logps/chosen": -2.994612693786621, + "logps/rejected": -4.390146255493164, + "loss": 0.4199, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -2.994612693786621, + "rewards/margins": 1.395533800125122, + "rewards/rejected": -4.390146255493164, + "sft_loss": 3.0476841926574707, + "step": 4970 + }, + { + "epoch": 2.6626526174945644, + "grad_norm": 17.366292153784123, + "learning_rate": 3.789650537620903e-08, + "logits/chosen": -0.18064144253730774, + "logits/rejected": -0.10504363477230072, + "logps/chosen": -3.2243492603302, + "logps/rejected": -4.436784744262695, + "loss": 0.4472, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -3.2243492603302, + "rewards/margins": 1.2124359607696533, + "rewards/rejected": -4.436784744262695, + "sft_loss": 3.332610607147217, + "step": 4975 + }, + { + "epoch": 2.665328650275966, + "grad_norm": 17.975524302441073, + "learning_rate": 3.730398850445182e-08, + "logits/chosen": -0.11991802603006363, + "logits/rejected": -0.06612943112850189, + "logps/chosen": -3.369271755218506, + "logps/rejected": -4.555083274841309, + "loss": 0.4937, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.369271755218506, + "rewards/margins": 1.1858118772506714, + "rewards/rejected": -4.555083274841309, + "sft_loss": 3.3891067504882812, + "step": 4980 + }, + { + "epoch": 2.6680046830573674, + "grad_norm": 18.379547604926636, + "learning_rate": 3.671596071142735e-08, + "logits/chosen": -0.1973075568675995, + "logits/rejected": 0.03738722950220108, + "logps/chosen": -3.0913569927215576, + "logps/rejected": -4.440980434417725, + "loss": 0.4823, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.0913569927215576, + "rewards/margins": 1.349623441696167, + "rewards/rejected": -4.440980434417725, + "sft_loss": 3.18648362159729, + "step": 4985 + }, + { + "epoch": 2.670680715838769, + "grad_norm": 18.614410610882675, + "learning_rate": 3.6132427702183996e-08, + "logits/chosen": -0.3264130651950836, + "logits/rejected": -0.11211379617452621, + "logps/chosen": -3.138422966003418, + "logps/rejected": -4.492595672607422, + "loss": 0.42, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -3.138422966003418, + "rewards/margins": 1.3541723489761353, + "rewards/rejected": -4.492595672607422, + "sft_loss": 3.29201078414917, + "step": 4990 + }, + { + "epoch": 2.6733567486201704, + "grad_norm": 16.98717184658078, + "learning_rate": 3.555339513816147e-08, + "logits/chosen": -0.2549840807914734, + "logits/rejected": -0.23621661961078644, + "logps/chosen": -3.2311882972717285, + "logps/rejected": -4.25803279876709, + "loss": 0.5272, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.2311882972717285, + "rewards/margins": 1.026845097541809, + "rewards/rejected": -4.25803279876709, + "sft_loss": 3.410748243331909, + "step": 4995 + }, + { + "epoch": 2.676032781401572, + "grad_norm": 19.140590465296466, + "learning_rate": 3.497886863713639e-08, + "logits/chosen": -0.23707112669944763, + "logits/rejected": -0.17840634286403656, + "logps/chosen": -3.2316176891326904, + "logps/rejected": -4.3862199783325195, + "loss": 0.515, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -3.2316176891326904, + "rewards/margins": 1.1546024084091187, + "rewards/rejected": -4.3862199783325195, + "sft_loss": 3.41558575630188, + "step": 5000 + }, + { + "epoch": 2.678708814182974, + "grad_norm": 19.91206142487075, + "learning_rate": 3.440885377316721e-08, + "logits/chosen": -0.18267245590686798, + "logits/rejected": -0.09775508940219879, + "logps/chosen": -3.204019546508789, + "logps/rejected": -4.2952752113342285, + "loss": 0.4578, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.204019546508789, + "rewards/margins": 1.091255784034729, + "rewards/rejected": -4.2952752113342285, + "sft_loss": 3.3476028442382812, + "step": 5005 + }, + { + "epoch": 2.6813848469643755, + "grad_norm": 20.561056582271576, + "learning_rate": 3.384335607654082e-08, + "logits/chosen": -0.16455042362213135, + "logits/rejected": -0.049335233867168427, + "logps/chosen": -3.2577567100524902, + "logps/rejected": -4.439981460571289, + "loss": 0.4627, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -3.2577567100524902, + "rewards/margins": 1.1822245121002197, + "rewards/rejected": -4.439981460571289, + "sft_loss": 3.3594517707824707, + "step": 5010 + }, + { + "epoch": 2.684060879745777, + "grad_norm": 20.92663548817318, + "learning_rate": 3.328238103371811e-08, + "logits/chosen": -0.24483826756477356, + "logits/rejected": -0.14432474970817566, + "logps/chosen": -3.205540180206299, + "logps/rejected": -4.408441066741943, + "loss": 0.4572, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -3.205540180206299, + "rewards/margins": 1.2029006481170654, + "rewards/rejected": -4.408441066741943, + "sft_loss": 3.2874832153320312, + "step": 5015 + }, + { + "epoch": 2.6867369125271785, + "grad_norm": 24.79855818848997, + "learning_rate": 3.272593408728169e-08, + "logits/chosen": -0.28618526458740234, + "logits/rejected": 0.0043426095508039, + "logps/chosen": -3.109508991241455, + "logps/rejected": -4.331753730773926, + "loss": 0.4717, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.109508991241455, + "rewards/margins": 1.2222447395324707, + "rewards/rejected": -4.331753730773926, + "sft_loss": 3.335289478302002, + "step": 5020 + }, + { + "epoch": 2.6894129453085798, + "grad_norm": 16.22444594329588, + "learning_rate": 3.217402063588204e-08, + "logits/chosen": -0.2819980978965759, + "logits/rejected": -0.08756112307310104, + "logps/chosen": -3.1473426818847656, + "logps/rejected": -4.334247589111328, + "loss": 0.4742, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -3.1473426818847656, + "rewards/margins": 1.1869052648544312, + "rewards/rejected": -4.334247589111328, + "sft_loss": 3.2710742950439453, + "step": 5025 + }, + { + "epoch": 2.6920889780899815, + "grad_norm": 14.71109917318996, + "learning_rate": 3.162664603418608e-08, + "logits/chosen": -0.2247733771800995, + "logits/rejected": -0.10569185018539429, + "logps/chosen": -3.121309995651245, + "logps/rejected": -4.529058933258057, + "loss": 0.4511, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -3.121309995651245, + "rewards/margins": 1.4077484607696533, + "rewards/rejected": -4.529058933258057, + "sft_loss": 3.2245278358459473, + "step": 5030 + }, + { + "epoch": 2.694765010871383, + "grad_norm": 25.465988896577702, + "learning_rate": 3.1083815592824416e-08, + "logits/chosen": -0.2195226401090622, + "logits/rejected": -0.06730414927005768, + "logps/chosen": -3.2314651012420654, + "logps/rejected": -4.430628776550293, + "loss": 0.4673, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.2314651012420654, + "rewards/margins": 1.1991630792617798, + "rewards/rejected": -4.430628776550293, + "sft_loss": 3.365612506866455, + "step": 5035 + }, + { + "epoch": 2.697441043652785, + "grad_norm": 17.294025816863705, + "learning_rate": 3.054553457834053e-08, + "logits/chosen": -0.03186682611703873, + "logits/rejected": -0.05140848085284233, + "logps/chosen": -3.281543254852295, + "logps/rejected": -4.466519355773926, + "loss": 0.4721, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.281543254852295, + "rewards/margins": 1.184975504875183, + "rewards/rejected": -4.466519355773926, + "sft_loss": 3.3809001445770264, + "step": 5040 + }, + { + "epoch": 2.700117076434186, + "grad_norm": 22.62680292614812, + "learning_rate": 3.0011808213139036e-08, + "logits/chosen": -0.12738078832626343, + "logits/rejected": -0.1271544247865677, + "logps/chosen": -3.167606830596924, + "logps/rejected": -4.394031524658203, + "loss": 0.4707, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.167606830596924, + "rewards/margins": 1.2264244556427002, + "rewards/rejected": -4.394031524658203, + "sft_loss": 3.285980224609375, + "step": 5045 + }, + { + "epoch": 2.702793109215588, + "grad_norm": 17.79576892462079, + "learning_rate": 2.948264167543568e-08, + "logits/chosen": -0.2033161222934723, + "logits/rejected": -0.09692879766225815, + "logps/chosen": -2.983433246612549, + "logps/rejected": -4.184727668762207, + "loss": 0.4358, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -2.983433246612549, + "rewards/margins": 1.2012943029403687, + "rewards/rejected": -4.184727668762207, + "sft_loss": 3.1466856002807617, + "step": 5050 + }, + { + "epoch": 2.7054691419969896, + "grad_norm": 17.513471774648877, + "learning_rate": 2.8958040099206216e-08, + "logits/chosen": -0.33082449436187744, + "logits/rejected": -0.20554928481578827, + "logps/chosen": -2.940002679824829, + "logps/rejected": -4.253390312194824, + "loss": 0.4264, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.940002679824829, + "rewards/margins": 1.3133876323699951, + "rewards/rejected": -4.253390312194824, + "sft_loss": 3.0926265716552734, + "step": 5055 + }, + { + "epoch": 2.708145174778391, + "grad_norm": 22.424994647222533, + "learning_rate": 2.843800857413775e-08, + "logits/chosen": -0.17994250357151031, + "logits/rejected": -0.08553121984004974, + "logps/chosen": -3.0669708251953125, + "logps/rejected": -4.204202651977539, + "loss": 0.4954, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.0669708251953125, + "rewards/margins": 1.137231469154358, + "rewards/rejected": -4.204202651977539, + "sft_loss": 3.2526612281799316, + "step": 5060 + }, + { + "epoch": 2.7108212075597926, + "grad_norm": 19.948217087692427, + "learning_rate": 2.7922552145578203e-08, + "logits/chosen": -0.22585833072662354, + "logits/rejected": 0.07776106148958206, + "logps/chosen": -3.027068614959717, + "logps/rejected": -4.328622341156006, + "loss": 0.4434, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.027068614959717, + "rewards/margins": 1.30155348777771, + "rewards/rejected": -4.328622341156006, + "sft_loss": 3.1880125999450684, + "step": 5065 + }, + { + "epoch": 2.7134972403411943, + "grad_norm": 22.78413021377979, + "learning_rate": 2.7411675814488277e-08, + "logits/chosen": -0.1200130432844162, + "logits/rejected": 0.06273828446865082, + "logps/chosen": -3.0594770908355713, + "logps/rejected": -4.220132350921631, + "loss": 0.4505, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.0594770908355713, + "rewards/margins": 1.1606553792953491, + "rewards/rejected": -4.220132350921631, + "sft_loss": 3.2853927612304688, + "step": 5070 + }, + { + "epoch": 2.7161732731225956, + "grad_norm": 26.089826211640414, + "learning_rate": 2.690538453739216e-08, + "logits/chosen": -0.17602473497390747, + "logits/rejected": -0.085169717669487, + "logps/chosen": -3.109769105911255, + "logps/rejected": -4.0252203941345215, + "loss": 0.5595, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -3.109769105911255, + "rewards/margins": 0.9154506921768188, + "rewards/rejected": -4.0252203941345215, + "sft_loss": 3.303795576095581, + "step": 5075 + }, + { + "epoch": 2.7188493059039973, + "grad_norm": 16.519227680810705, + "learning_rate": 2.6403683226330298e-08, + "logits/chosen": -0.2547515034675598, + "logits/rejected": -0.05338859558105469, + "logps/chosen": -3.140300750732422, + "logps/rejected": -4.311095714569092, + "loss": 0.4865, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.140300750732422, + "rewards/margins": 1.1707950830459595, + "rewards/rejected": -4.311095714569092, + "sft_loss": 3.2734668254852295, + "step": 5080 + }, + { + "epoch": 2.721525338685399, + "grad_norm": 36.29648134115861, + "learning_rate": 2.5906576748810804e-08, + "logits/chosen": -0.26947951316833496, + "logits/rejected": -0.11737142503261566, + "logps/chosen": -3.053846836090088, + "logps/rejected": -4.512036323547363, + "loss": 0.413, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.053846836090088, + "rewards/margins": 1.4581893682479858, + "rewards/rejected": -4.512036323547363, + "sft_loss": 3.2287800312042236, + "step": 5085 + }, + { + "epoch": 2.7242013714668003, + "grad_norm": 22.703866516058344, + "learning_rate": 2.5414069927763016e-08, + "logits/chosen": -0.3520892262458801, + "logits/rejected": -0.11236388981342316, + "logps/chosen": -3.2202491760253906, + "logps/rejected": -4.552691459655762, + "loss": 0.439, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -3.2202491760253906, + "rewards/margins": 1.3324424028396606, + "rewards/rejected": -4.552691459655762, + "sft_loss": 3.3652408123016357, + "step": 5090 + }, + { + "epoch": 2.726877404248202, + "grad_norm": 15.012282479125542, + "learning_rate": 2.4926167541490185e-08, + "logits/chosen": -0.370398610830307, + "logits/rejected": -0.10767936706542969, + "logps/chosen": -3.0924620628356934, + "logps/rejected": -4.428684234619141, + "loss": 0.4472, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -3.0924620628356934, + "rewards/margins": 1.3362222909927368, + "rewards/rejected": -4.428684234619141, + "sft_loss": 3.2309207916259766, + "step": 5095 + }, + { + "epoch": 2.7295534370296037, + "grad_norm": 12.801368025013259, + "learning_rate": 2.4442874323623574e-08, + "logits/chosen": -0.1572403907775879, + "logits/rejected": 0.01683208718895912, + "logps/chosen": -3.1529765129089355, + "logps/rejected": -4.545765399932861, + "loss": 0.4614, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.1529765129089355, + "rewards/margins": 1.3927887678146362, + "rewards/rejected": -4.545765399932861, + "sft_loss": 3.323516368865967, + "step": 5100 + }, + { + "epoch": 2.7322294698110055, + "grad_norm": 19.38537449809977, + "learning_rate": 2.396419496307589e-08, + "logits/chosen": -0.20657595992088318, + "logits/rejected": 0.0014978349208831787, + "logps/chosen": -3.275643825531006, + "logps/rejected": -4.5586090087890625, + "loss": 0.4631, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.275643825531006, + "rewards/margins": 1.2829653024673462, + "rewards/rejected": -4.5586090087890625, + "sft_loss": 3.4100615978240967, + "step": 5105 + }, + { + "epoch": 2.7349055025924067, + "grad_norm": 19.297910442054516, + "learning_rate": 2.349013410399653e-08, + "logits/chosen": -0.21427297592163086, + "logits/rejected": -0.06879232078790665, + "logps/chosen": -3.067349672317505, + "logps/rejected": -4.291683673858643, + "loss": 0.468, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -3.067349672317505, + "rewards/margins": 1.2243340015411377, + "rewards/rejected": -4.291683673858643, + "sft_loss": 3.214346408843994, + "step": 5110 + }, + { + "epoch": 2.7375815353738084, + "grad_norm": 15.978901754824962, + "learning_rate": 2.3020696345725954e-08, + "logits/chosen": -0.29842501878738403, + "logits/rejected": -0.017918001860380173, + "logps/chosen": -3.176945209503174, + "logps/rejected": -4.515137672424316, + "loss": 0.4066, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.176945209503174, + "rewards/margins": 1.3381928205490112, + "rewards/rejected": -4.515137672424316, + "sft_loss": 3.2655911445617676, + "step": 5115 + }, + { + "epoch": 2.7402575681552097, + "grad_norm": 25.512112118049004, + "learning_rate": 2.2555886242751398e-08, + "logits/chosen": -0.23838338255882263, + "logits/rejected": -0.1361640989780426, + "logps/chosen": -3.1773295402526855, + "logps/rejected": -4.491230010986328, + "loss": 0.4194, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -3.1773295402526855, + "rewards/margins": 1.313900351524353, + "rewards/rejected": -4.491230010986328, + "sft_loss": 3.287815809249878, + "step": 5120 + }, + { + "epoch": 2.7429336009366114, + "grad_norm": 28.17408344751377, + "learning_rate": 2.2095708304662453e-08, + "logits/chosen": -0.35994330048561096, + "logits/rejected": -0.07787968963384628, + "logps/chosen": -3.064584255218506, + "logps/rejected": -4.313921928405762, + "loss": 0.4529, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.064584255218506, + "rewards/margins": 1.2493374347686768, + "rewards/rejected": -4.313921928405762, + "sft_loss": 3.270467758178711, + "step": 5125 + }, + { + "epoch": 2.745609633718013, + "grad_norm": 17.656274722525687, + "learning_rate": 2.16401669961076e-08, + "logits/chosen": -0.3636077344417572, + "logits/rejected": -0.12653307616710663, + "logps/chosen": -3.1638081073760986, + "logps/rejected": -4.486863613128662, + "loss": 0.4416, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.1638081073760986, + "rewards/margins": 1.3230552673339844, + "rewards/rejected": -4.486863613128662, + "sft_loss": 3.383385181427002, + "step": 5130 + }, + { + "epoch": 2.748285666499415, + "grad_norm": 25.407359996428223, + "learning_rate": 2.1189266736750532e-08, + "logits/chosen": -0.14153358340263367, + "logits/rejected": -0.0551319494843483, + "logps/chosen": -3.1332790851593018, + "logps/rejected": -4.269174098968506, + "loss": 0.4787, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.1332790851593018, + "rewards/margins": 1.135895013809204, + "rewards/rejected": -4.269174098968506, + "sft_loss": 3.3371098041534424, + "step": 5135 + }, + { + "epoch": 2.750961699280816, + "grad_norm": 17.329397758719967, + "learning_rate": 2.0743011901227623e-08, + "logits/chosen": -0.17390862107276917, + "logits/rejected": 0.027312543243169785, + "logps/chosen": -3.2330029010772705, + "logps/rejected": -4.444141387939453, + "loss": 0.4505, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.2330029010772705, + "rewards/margins": 1.2111393213272095, + "rewards/rejected": -4.444141387939453, + "sft_loss": 3.3181076049804688, + "step": 5140 + }, + { + "epoch": 2.753637732062218, + "grad_norm": 25.672394019334682, + "learning_rate": 2.030140681910508e-08, + "logits/chosen": -0.18414786458015442, + "logits/rejected": -0.015248274430632591, + "logps/chosen": -3.1611456871032715, + "logps/rejected": -4.285699844360352, + "loss": 0.5, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.1611456871032715, + "rewards/margins": 1.1245542764663696, + "rewards/rejected": -4.285699844360352, + "sft_loss": 3.3388450145721436, + "step": 5145 + }, + { + "epoch": 2.756313764843619, + "grad_norm": 16.777482622717063, + "learning_rate": 1.986445577483753e-08, + "logits/chosen": -0.27730321884155273, + "logits/rejected": -0.11832135915756226, + "logps/chosen": -3.099640369415283, + "logps/rejected": -4.361514091491699, + "loss": 0.4583, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.099640369415283, + "rewards/margins": 1.2618744373321533, + "rewards/rejected": -4.361514091491699, + "sft_loss": 3.2630677223205566, + "step": 5150 + }, + { + "epoch": 2.758989797625021, + "grad_norm": 16.454912586585895, + "learning_rate": 1.9432163007725765e-08, + "logits/chosen": -0.29692643880844116, + "logits/rejected": -0.17871159315109253, + "logps/chosen": -3.115957260131836, + "logps/rejected": -4.290701389312744, + "loss": 0.4671, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -3.115957260131836, + "rewards/margins": 1.174743890762329, + "rewards/rejected": -4.290701389312744, + "sft_loss": 3.319054365158081, + "step": 5155 + }, + { + "epoch": 2.7616658304064226, + "grad_norm": 15.672889445754924, + "learning_rate": 1.9004532711876297e-08, + "logits/chosen": -0.22892217338085175, + "logits/rejected": -0.17264311015605927, + "logps/chosen": -3.000776767730713, + "logps/rejected": -4.343716144561768, + "loss": 0.4387, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -3.000776767730713, + "rewards/margins": 1.342938780784607, + "rewards/rejected": -4.343716144561768, + "sft_loss": 3.2002334594726562, + "step": 5160 + }, + { + "epoch": 2.7643418631878243, + "grad_norm": 18.34683396965878, + "learning_rate": 1.8581569036159928e-08, + "logits/chosen": -0.2559751272201538, + "logits/rejected": -0.019560130313038826, + "logps/chosen": -3.0496087074279785, + "logps/rejected": -4.328253269195557, + "loss": 0.437, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.0496087074279785, + "rewards/margins": 1.27864408493042, + "rewards/rejected": -4.328253269195557, + "sft_loss": 3.156299114227295, + "step": 5165 + }, + { + "epoch": 2.7670178959692255, + "grad_norm": 13.638033094970202, + "learning_rate": 1.8163276084172285e-08, + "logits/chosen": -0.22779671847820282, + "logits/rejected": -0.06113610416650772, + "logps/chosen": -3.1405844688415527, + "logps/rejected": -4.442783355712891, + "loss": 0.4255, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -3.1405844688415527, + "rewards/margins": 1.3021987676620483, + "rewards/rejected": -4.442783355712891, + "sft_loss": 3.345402479171753, + "step": 5170 + }, + { + "epoch": 2.7696939287506273, + "grad_norm": 18.255590078165085, + "learning_rate": 1.7749657914193194e-08, + "logits/chosen": -0.2028944194316864, + "logits/rejected": -0.08475472033023834, + "logps/chosen": -3.2638847827911377, + "logps/rejected": -4.632298469543457, + "loss": 0.3987, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.2638847827911377, + "rewards/margins": 1.3684141635894775, + "rewards/rejected": -4.632298469543457, + "sft_loss": 3.346186876296997, + "step": 5175 + }, + { + "epoch": 2.7723699615320285, + "grad_norm": 24.772714135947187, + "learning_rate": 1.7340718539148203e-08, + "logits/chosen": -0.16636791825294495, + "logits/rejected": -0.08039329200983047, + "logps/chosen": -3.3139376640319824, + "logps/rejected": -4.398904323577881, + "loss": 0.4981, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.3139376640319824, + "rewards/margins": 1.0849663019180298, + "rewards/rejected": -4.398904323577881, + "sft_loss": 3.5393283367156982, + "step": 5180 + }, + { + "epoch": 2.7750459943134302, + "grad_norm": 15.997616887752065, + "learning_rate": 1.6936461926568724e-08, + "logits/chosen": -0.17959436774253845, + "logits/rejected": 0.0007152434554882348, + "logps/chosen": -3.1098248958587646, + "logps/rejected": -4.493106365203857, + "loss": 0.4645, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -3.1098248958587646, + "rewards/margins": 1.3832814693450928, + "rewards/rejected": -4.493106365203857, + "sft_loss": 3.354473829269409, + "step": 5185 + }, + { + "epoch": 2.777722027094832, + "grad_norm": 19.2292266193858, + "learning_rate": 1.6536891998554346e-08, + "logits/chosen": -0.32440274953842163, + "logits/rejected": -0.11405463516712189, + "logps/chosen": -3.0503506660461426, + "logps/rejected": -4.309788227081299, + "loss": 0.4564, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.0503506660461426, + "rewards/margins": 1.259437918663025, + "rewards/rejected": -4.309788227081299, + "sft_loss": 3.2820911407470703, + "step": 5190 + }, + { + "epoch": 2.7803980598762337, + "grad_norm": 18.601636609926377, + "learning_rate": 1.6142012631734093e-08, + "logits/chosen": -0.1657555103302002, + "logits/rejected": 0.01971413753926754, + "logps/chosen": -3.1654350757598877, + "logps/rejected": -4.4513959884643555, + "loss": 0.4367, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.1654350757598877, + "rewards/margins": 1.2859606742858887, + "rewards/rejected": -4.4513959884643555, + "sft_loss": 3.287670135498047, + "step": 5195 + }, + { + "epoch": 2.783074092657635, + "grad_norm": 23.10107289402409, + "learning_rate": 1.575182765722949e-08, + "logits/chosen": -0.311906635761261, + "logits/rejected": -0.10322600603103638, + "logps/chosen": -3.0944015979766846, + "logps/rejected": -4.473546028137207, + "loss": 0.4272, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -3.0944015979766846, + "rewards/margins": 1.3791444301605225, + "rewards/rejected": -4.473546028137207, + "sft_loss": 3.301250457763672, + "step": 5200 + }, + { + "epoch": 2.783074092657635, + "eval_logits/chosen": 0.22810406982898712, + "eval_logits/rejected": 0.35918039083480835, + "eval_logps/chosen": -3.313793182373047, + "eval_logps/rejected": -4.3619208335876465, + "eval_loss": 0.5578520894050598, + "eval_rewards/accuracies": 0.7232937812805176, + "eval_rewards/chosen": -3.313793182373047, + "eval_rewards/margins": 1.0481278896331787, + "eval_rewards/rejected": -4.3619208335876465, + "eval_runtime": 50.946, + "eval_samples_per_second": 26.401, + "eval_sft_loss": 3.4522531032562256, + "eval_steps_per_second": 6.615, + "step": 5200 + }, + { + "epoch": 2.7857501254390367, + "grad_norm": 11.63526468440465, + "learning_rate": 1.536634086061672e-08, + "logits/chosen": -0.1776561737060547, + "logits/rejected": -0.08043881505727768, + "logps/chosen": -3.1294918060302734, + "logps/rejected": -4.408661842346191, + "loss": 0.4416, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.1294918060302734, + "rewards/margins": 1.2791701555252075, + "rewards/rejected": -4.408661842346191, + "sft_loss": 3.2111599445343018, + "step": 5205 + }, + { + "epoch": 2.788426158220438, + "grad_norm": 16.73777140478313, + "learning_rate": 1.4985555981890495e-08, + "logits/chosen": -0.20793786644935608, + "logits/rejected": -0.06591515988111496, + "logps/chosen": -3.1309242248535156, + "logps/rejected": -4.449841499328613, + "loss": 0.4562, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -3.1309242248535156, + "rewards/margins": 1.31891667842865, + "rewards/rejected": -4.449841499328613, + "sft_loss": 3.2599101066589355, + "step": 5210 + }, + { + "epoch": 2.7911021910018396, + "grad_norm": 16.52444894688706, + "learning_rate": 1.4609476715427226e-08, + "logits/chosen": -0.21759703755378723, + "logits/rejected": -0.0982574075460434, + "logps/chosen": -3.0120937824249268, + "logps/rejected": -4.364049434661865, + "loss": 0.4262, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -3.0120937824249268, + "rewards/margins": 1.3519560098648071, + "rewards/rejected": -4.364049434661865, + "sft_loss": 3.1805450916290283, + "step": 5215 + }, + { + "epoch": 2.7937782237832414, + "grad_norm": 16.345210936985595, + "learning_rate": 1.4238106709949792e-08, + "logits/chosen": -0.2661355137825012, + "logits/rejected": -0.13124528527259827, + "logps/chosen": -3.1420581340789795, + "logps/rejected": -4.6199517250061035, + "loss": 0.3903, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.1420581340789795, + "rewards/margins": 1.477893590927124, + "rewards/rejected": -4.6199517250061035, + "sft_loss": 3.2643871307373047, + "step": 5220 + }, + { + "epoch": 2.796454256564643, + "grad_norm": 23.04024804031211, + "learning_rate": 1.3871449568491511e-08, + "logits/chosen": -0.20268850028514862, + "logits/rejected": 0.005442657973617315, + "logps/chosen": -3.203855514526367, + "logps/rejected": -4.447484016418457, + "loss": 0.4626, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -3.203855514526367, + "rewards/margins": 1.2436293363571167, + "rewards/rejected": -4.447484016418457, + "sft_loss": 3.311129331588745, + "step": 5225 + }, + { + "epoch": 2.7991302893460444, + "grad_norm": 13.165452282321871, + "learning_rate": 1.3509508848361606e-08, + "logits/chosen": -0.347541481256485, + "logits/rejected": -0.14753584563732147, + "logps/chosen": -3.1543853282928467, + "logps/rejected": -4.4243059158325195, + "loss": 0.4364, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.1543853282928467, + "rewards/margins": 1.2699207067489624, + "rewards/rejected": -4.4243059158325195, + "sft_loss": 3.2332236766815186, + "step": 5230 + }, + { + "epoch": 2.801806322127446, + "grad_norm": 15.873162098156593, + "learning_rate": 1.3152288061110517e-08, + "logits/chosen": -0.3166837692260742, + "logits/rejected": -0.1324232965707779, + "logps/chosen": -3.044175386428833, + "logps/rejected": -4.35813045501709, + "loss": 0.436, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.044175386428833, + "rewards/margins": 1.313955545425415, + "rewards/rejected": -4.35813045501709, + "sft_loss": 3.1847636699676514, + "step": 5235 + }, + { + "epoch": 2.804482354908848, + "grad_norm": 20.314089406390774, + "learning_rate": 1.2799790672495814e-08, + "logits/chosen": -0.29718679189682007, + "logits/rejected": -0.04755071923136711, + "logps/chosen": -3.2006309032440186, + "logps/rejected": -4.376867294311523, + "loss": 0.4736, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -3.2006309032440186, + "rewards/margins": 1.176236867904663, + "rewards/rejected": -4.376867294311523, + "sft_loss": 3.3230297565460205, + "step": 5240 + }, + { + "epoch": 2.807158387690249, + "grad_norm": 20.805364839798266, + "learning_rate": 1.2452020102448835e-08, + "logits/chosen": -0.22540588676929474, + "logits/rejected": -0.13884001970291138, + "logps/chosen": -3.12069034576416, + "logps/rejected": -4.296230792999268, + "loss": 0.4855, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.12069034576416, + "rewards/margins": 1.1755400896072388, + "rewards/rejected": -4.296230792999268, + "sft_loss": 3.3076324462890625, + "step": 5245 + }, + { + "epoch": 2.8098344204716508, + "grad_norm": 30.80994255066866, + "learning_rate": 1.2108979725041103e-08, + "logits/chosen": -0.32555264234542847, + "logits/rejected": -0.14052915573120117, + "logps/chosen": -3.2292141914367676, + "logps/rejected": -4.531303405761719, + "loss": 0.4532, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.2292141914367676, + "rewards/margins": 1.3020894527435303, + "rewards/rejected": -4.531303405761719, + "sft_loss": 3.4142825603485107, + "step": 5250 + }, + { + "epoch": 2.8125104532530525, + "grad_norm": 17.44712826499318, + "learning_rate": 1.1770672868451958e-08, + "logits/chosen": -0.28547942638397217, + "logits/rejected": -0.024611469358205795, + "logps/chosen": -3.3335862159729004, + "logps/rejected": -4.608405113220215, + "loss": 0.4362, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.3335862159729004, + "rewards/margins": 1.2748191356658936, + "rewards/rejected": -4.608405113220215, + "sft_loss": 3.3824000358581543, + "step": 5255 + }, + { + "epoch": 2.8151864860344538, + "grad_norm": 22.87907119725168, + "learning_rate": 1.1437102814935872e-08, + "logits/chosen": -0.21431437134742737, + "logits/rejected": -0.1254430115222931, + "logps/chosen": -3.172545909881592, + "logps/rejected": -4.361286640167236, + "loss": 0.4921, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.172545909881592, + "rewards/margins": 1.1887407302856445, + "rewards/rejected": -4.361286640167236, + "sft_loss": 3.4551422595977783, + "step": 5260 + }, + { + "epoch": 2.8178625188158555, + "grad_norm": 16.548951869023035, + "learning_rate": 1.1108272800791018e-08, + "logits/chosen": -0.34953418374061584, + "logits/rejected": -0.056724101305007935, + "logps/chosen": -3.297877073287964, + "logps/rejected": -4.5453925132751465, + "loss": 0.4618, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.297877073287964, + "rewards/margins": 1.2475159168243408, + "rewards/rejected": -4.5453925132751465, + "sft_loss": 3.439291477203369, + "step": 5265 + }, + { + "epoch": 2.820538551597257, + "grad_norm": 17.836969106027368, + "learning_rate": 1.078418601632769e-08, + "logits/chosen": -0.1767922341823578, + "logits/rejected": 0.0060912007465958595, + "logps/chosen": -3.233365297317505, + "logps/rejected": -4.544157981872559, + "loss": 0.4216, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.233365297317505, + "rewards/margins": 1.3107929229736328, + "rewards/rejected": -4.544157981872559, + "sft_loss": 3.4093177318573, + "step": 5270 + }, + { + "epoch": 2.8232145843786585, + "grad_norm": 13.426686551628864, + "learning_rate": 1.0464845605837159e-08, + "logits/chosen": -0.21655690670013428, + "logits/rejected": -0.036606211215257645, + "logps/chosen": -3.2507522106170654, + "logps/rejected": -4.522493362426758, + "loss": 0.4176, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.2507522106170654, + "rewards/margins": 1.2717409133911133, + "rewards/rejected": -4.522493362426758, + "sft_loss": 3.311718702316284, + "step": 5275 + }, + { + "epoch": 2.82589061716006, + "grad_norm": 13.710812813707037, + "learning_rate": 1.0150254667561642e-08, + "logits/chosen": -0.23185142874717712, + "logits/rejected": 0.02568121626973152, + "logps/chosen": -3.3728013038635254, + "logps/rejected": -4.749502182006836, + "loss": 0.4133, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.3728013038635254, + "rewards/margins": 1.376700758934021, + "rewards/rejected": -4.749502182006836, + "sft_loss": 3.4171886444091797, + "step": 5280 + }, + { + "epoch": 2.828566649941462, + "grad_norm": 24.189221868553442, + "learning_rate": 9.840416253663719e-09, + "logits/chosen": -0.29940885305404663, + "logits/rejected": -0.14495554566383362, + "logps/chosen": -3.181389331817627, + "logps/rejected": -4.61637544631958, + "loss": 0.4307, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -3.181389331817627, + "rewards/margins": 1.434985637664795, + "rewards/rejected": -4.61637544631958, + "sft_loss": 3.3046677112579346, + "step": 5285 + }, + { + "epoch": 2.8312426827228636, + "grad_norm": 16.26694146579324, + "learning_rate": 9.535333370197074e-09, + "logits/chosen": -0.2290172129869461, + "logits/rejected": -0.030090123414993286, + "logps/chosen": -3.1338648796081543, + "logps/rejected": -4.436646461486816, + "loss": 0.4332, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.1338648796081543, + "rewards/margins": 1.3027812242507935, + "rewards/rejected": -4.436646461486816, + "sft_loss": 3.3667190074920654, + "step": 5290 + }, + { + "epoch": 2.833918715504265, + "grad_norm": 17.591562684197445, + "learning_rate": 9.23500897707713e-09, + "logits/chosen": -0.3069230914115906, + "logits/rejected": -0.06996993720531464, + "logps/chosen": -3.3611602783203125, + "logps/rejected": -4.745880603790283, + "loss": 0.431, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -3.3611602783203125, + "rewards/margins": 1.3847198486328125, + "rewards/rejected": -4.745880603790283, + "sft_loss": 3.4696857929229736, + "step": 5295 + }, + { + "epoch": 2.8365947482856666, + "grad_norm": 18.556496185207415, + "learning_rate": 8.939445988052574e-09, + "logits/chosen": -0.23846478760242462, + "logits/rejected": -0.1405213624238968, + "logps/chosen": -3.142049551010132, + "logps/rejected": -4.500870704650879, + "loss": 0.4318, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -3.142049551010132, + "rewards/margins": 1.3588216304779053, + "rewards/rejected": -4.500870704650879, + "sft_loss": 3.2427544593811035, + "step": 5300 + }, + { + "epoch": 2.839270781067068, + "grad_norm": 34.73071470891618, + "learning_rate": 8.648647270676656e-09, + "logits/chosen": -0.2352648228406906, + "logits/rejected": -0.0805886760354042, + "logps/chosen": -3.24641489982605, + "logps/rejected": -4.425256729125977, + "loss": 0.4864, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.24641489982605, + "rewards/margins": 1.1788415908813477, + "rewards/rejected": -4.425256729125977, + "sft_loss": 3.4514222145080566, + "step": 5305 + }, + { + "epoch": 2.8419468138484696, + "grad_norm": 16.060659457953548, + "learning_rate": 8.362615646279991e-09, + "logits/chosen": -0.393909752368927, + "logits/rejected": -0.10782015323638916, + "logps/chosen": -3.2175469398498535, + "logps/rejected": -4.7070841789245605, + "loss": 0.4525, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.2175469398498535, + "rewards/margins": 1.4895371198654175, + "rewards/rejected": -4.7070841789245605, + "sft_loss": 3.373231887817383, + "step": 5310 + }, + { + "epoch": 2.8446228466298713, + "grad_norm": 24.592200703808505, + "learning_rate": 8.081353889942466e-09, + "logits/chosen": -0.17484773695468903, + "logits/rejected": 0.05669660493731499, + "logps/chosen": -3.2570137977600098, + "logps/rejected": -4.378922462463379, + "loss": 0.4675, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.2570137977600098, + "rewards/margins": 1.1219086647033691, + "rewards/rejected": -4.378922462463379, + "sft_loss": 3.40653920173645, + "step": 5315 + }, + { + "epoch": 2.847298879411273, + "grad_norm": 23.247955885169546, + "learning_rate": 7.804864730467042e-09, + "logits/chosen": -0.1862056851387024, + "logits/rejected": -0.10346100479364395, + "logps/chosen": -3.1827163696289062, + "logps/rejected": -4.400063991546631, + "loss": 0.4369, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.1827163696289062, + "rewards/margins": 1.2173478603363037, + "rewards/rejected": -4.400063991546631, + "sft_loss": 3.239983320236206, + "step": 5320 + }, + { + "epoch": 2.8499749121926743, + "grad_norm": 13.975924625354944, + "learning_rate": 7.533150850352665e-09, + "logits/chosen": -0.23412814736366272, + "logits/rejected": -0.014413821510970592, + "logps/chosen": -3.18890118598938, + "logps/rejected": -4.628230094909668, + "loss": 0.397, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.18890118598938, + "rewards/margins": 1.439328908920288, + "rewards/rejected": -4.628230094909668, + "sft_loss": 3.325713634490967, + "step": 5325 + }, + { + "epoch": 2.852650944974076, + "grad_norm": 20.02004348976186, + "learning_rate": 7.2662148857686175e-09, + "logits/chosen": -0.17493878304958344, + "logits/rejected": -0.05669688060879707, + "logps/chosen": -3.1119914054870605, + "logps/rejected": -4.453869819641113, + "loss": 0.4648, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -3.1119914054870605, + "rewards/margins": 1.3418781757354736, + "rewards/rejected": -4.453869819641113, + "sft_loss": 3.3459744453430176, + "step": 5330 + }, + { + "epoch": 2.8553269777554773, + "grad_norm": 17.006423751521428, + "learning_rate": 7.0040594265287635e-09, + "logits/chosen": -0.13383543491363525, + "logits/rejected": -0.18039759993553162, + "logps/chosen": -3.1353771686553955, + "logps/rejected": -4.129621982574463, + "loss": 0.5205, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -3.1353771686553955, + "rewards/margins": 0.994244396686554, + "rewards/rejected": -4.129621982574463, + "sft_loss": 3.2968032360076904, + "step": 5335 + }, + { + "epoch": 2.858003010536879, + "grad_norm": 15.672222239707732, + "learning_rate": 6.746687016066566e-09, + "logits/chosen": -0.18188676238059998, + "logits/rejected": -0.09335924685001373, + "logps/chosen": -3.066502332687378, + "logps/rejected": -4.382439136505127, + "loss": 0.4402, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.066502332687378, + "rewards/margins": 1.3159363269805908, + "rewards/rejected": -4.382439136505127, + "sft_loss": 3.1553988456726074, + "step": 5340 + }, + { + "epoch": 2.8606790433182807, + "grad_norm": 19.99708773348044, + "learning_rate": 6.494100151410276e-09, + "logits/chosen": -0.3316792845726013, + "logits/rejected": -0.09101025760173798, + "logps/chosen": -3.124213218688965, + "logps/rejected": -4.363964557647705, + "loss": 0.4457, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -3.124213218688965, + "rewards/margins": 1.2397515773773193, + "rewards/rejected": -4.363964557647705, + "sft_loss": 3.3296375274658203, + "step": 5345 + }, + { + "epoch": 2.8633550760996824, + "grad_norm": 26.350892057682206, + "learning_rate": 6.246301283158728e-09, + "logits/chosen": -0.15335838496685028, + "logits/rejected": -0.14175648987293243, + "logps/chosen": -3.2014992237091064, + "logps/rejected": -4.230733394622803, + "loss": 0.5284, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.2014992237091064, + "rewards/margins": 1.0292346477508545, + "rewards/rejected": -4.230733394622803, + "sft_loss": 3.289850950241089, + "step": 5350 + }, + { + "epoch": 2.8660311088810837, + "grad_norm": 17.43689301985789, + "learning_rate": 6.0032928154576944e-09, + "logits/chosen": -0.2279408723115921, + "logits/rejected": -0.13653866946697235, + "logps/chosen": -3.2177162170410156, + "logps/rejected": -4.31894588470459, + "loss": 0.485, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -3.2177162170410156, + "rewards/margins": 1.1012293100357056, + "rewards/rejected": -4.31894588470459, + "sft_loss": 3.341726779937744, + "step": 5355 + }, + { + "epoch": 2.8687071416624854, + "grad_norm": 19.895315726239694, + "learning_rate": 5.76507710597629e-09, + "logits/chosen": -0.29223036766052246, + "logits/rejected": -0.05113974213600159, + "logps/chosen": -3.1430892944335938, + "logps/rejected": -4.285546779632568, + "loss": 0.4814, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.1430892944335938, + "rewards/margins": 1.1424576044082642, + "rewards/rejected": -4.285546779632568, + "sft_loss": 3.3228225708007812, + "step": 5360 + }, + { + "epoch": 2.8713831744438867, + "grad_norm": 14.092836229431335, + "learning_rate": 5.531656465884438e-09, + "logits/chosen": -0.2947445809841156, + "logits/rejected": -0.10095179080963135, + "logps/chosen": -3.106201171875, + "logps/rejected": -4.485922813415527, + "loss": 0.4225, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.106201171875, + "rewards/margins": 1.3797216415405273, + "rewards/rejected": -4.485922813415527, + "sft_loss": 3.2518115043640137, + "step": 5365 + }, + { + "epoch": 2.8740592072252884, + "grad_norm": 18.82181749076804, + "learning_rate": 5.303033159830217e-09, + "logits/chosen": -0.11835135519504547, + "logits/rejected": -0.06067631393671036, + "logps/chosen": -3.220038652420044, + "logps/rejected": -4.250067710876465, + "loss": 0.5098, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.220038652420044, + "rewards/margins": 1.0300289392471313, + "rewards/rejected": -4.250067710876465, + "sft_loss": 3.4482359886169434, + "step": 5370 + }, + { + "epoch": 2.87673524000669, + "grad_norm": 18.342939790515526, + "learning_rate": 5.079209405917939e-09, + "logits/chosen": -0.20136015117168427, + "logits/rejected": -0.07960865646600723, + "logps/chosen": -3.0695133209228516, + "logps/rejected": -4.61713981628418, + "loss": 0.4149, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.0695133209228516, + "rewards/margins": 1.5476267337799072, + "rewards/rejected": -4.61713981628418, + "sft_loss": 3.305483341217041, + "step": 5375 + }, + { + "epoch": 2.879411272788092, + "grad_norm": 21.89902867052027, + "learning_rate": 4.860187375686664e-09, + "logits/chosen": -0.3148624300956726, + "logits/rejected": -0.014219949953258038, + "logps/chosen": -3.2581634521484375, + "logps/rejected": -4.58234977722168, + "loss": 0.4281, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -3.2581634521484375, + "rewards/margins": 1.3241863250732422, + "rewards/rejected": -4.58234977722168, + "sft_loss": 3.4043948650360107, + "step": 5380 + }, + { + "epoch": 2.882087305569493, + "grad_norm": 17.55555860238155, + "learning_rate": 4.64596919408905e-09, + "logits/chosen": -0.16654792428016663, + "logits/rejected": -0.05108920484781265, + "logps/chosen": -3.102210521697998, + "logps/rejected": -4.339724063873291, + "loss": 0.4365, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -3.102210521697998, + "rewards/margins": 1.237513542175293, + "rewards/rejected": -4.339724063873291, + "sft_loss": 3.348740816116333, + "step": 5385 + }, + { + "epoch": 2.884763338350895, + "grad_norm": 16.612033893445844, + "learning_rate": 4.436556939470814e-09, + "logits/chosen": -0.23238448798656464, + "logits/rejected": -0.030825773254036903, + "logps/chosen": -3.314937114715576, + "logps/rejected": -4.333499431610107, + "loss": 0.5111, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.314937114715576, + "rewards/margins": 1.0185620784759521, + "rewards/rejected": -4.333499431610107, + "sft_loss": 3.47926664352417, + "step": 5390 + }, + { + "epoch": 2.887439371132296, + "grad_norm": 15.359683404778131, + "learning_rate": 4.23195264355064e-09, + "logits/chosen": -0.4084858000278473, + "logits/rejected": -0.1296214759349823, + "logps/chosen": -3.121598720550537, + "logps/rejected": -4.403274059295654, + "loss": 0.4311, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.121598720550537, + "rewards/margins": 1.281675934791565, + "rewards/rejected": -4.403274059295654, + "sft_loss": 3.3122265338897705, + "step": 5395 + }, + { + "epoch": 2.890115403913698, + "grad_norm": 18.92062612189072, + "learning_rate": 4.032158291400245e-09, + "logits/chosen": -0.30001047253608704, + "logits/rejected": 0.011221880093216896, + "logps/chosen": -3.0287575721740723, + "logps/rejected": -4.610257625579834, + "loss": 0.3946, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.0287575721740723, + "rewards/margins": 1.5815006494522095, + "rewards/rejected": -4.610257625579834, + "sft_loss": 3.1205813884735107, + "step": 5400 + }, + { + "epoch": 2.8927914366950995, + "grad_norm": 17.423505608686366, + "learning_rate": 3.837175821425398e-09, + "logits/chosen": -0.18389992415905, + "logits/rejected": -0.10028437525033951, + "logps/chosen": -3.2295024394989014, + "logps/rejected": -4.395108222961426, + "loss": 0.4898, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.2295024394989014, + "rewards/margins": 1.1656053066253662, + "rewards/rejected": -4.395108222961426, + "sft_loss": 3.303095579147339, + "step": 5405 + }, + { + "epoch": 2.8954674694765012, + "grad_norm": 13.317687293815423, + "learning_rate": 3.6470071253467683e-09, + "logits/chosen": -0.25252842903137207, + "logits/rejected": -0.0916924774646759, + "logps/chosen": -3.2524707317352295, + "logps/rejected": -4.713873863220215, + "loss": 0.4554, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -3.2524707317352295, + "rewards/margins": 1.4614031314849854, + "rewards/rejected": -4.713873863220215, + "sft_loss": 3.4252593517303467, + "step": 5410 + }, + { + "epoch": 2.8981435022579025, + "grad_norm": 12.30414809863451, + "learning_rate": 3.461654048181939e-09, + "logits/chosen": -0.27532312273979187, + "logits/rejected": -0.017485082149505615, + "logps/chosen": -3.317967176437378, + "logps/rejected": -4.423308372497559, + "loss": 0.4897, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.317967176437378, + "rewards/margins": 1.1053411960601807, + "rewards/rejected": -4.423308372497559, + "sft_loss": 3.5379645824432373, + "step": 5415 + }, + { + "epoch": 2.9008195350393042, + "grad_norm": 16.426245227176164, + "learning_rate": 3.281118388227255e-09, + "logits/chosen": -0.19364751875400543, + "logits/rejected": -0.11015711724758148, + "logps/chosen": -3.285496234893799, + "logps/rejected": -4.39368200302124, + "loss": 0.5159, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -3.285496234893799, + "rewards/margins": 1.1081856489181519, + "rewards/rejected": -4.39368200302124, + "sft_loss": 3.4461159706115723, + "step": 5420 + }, + { + "epoch": 2.903495567820706, + "grad_norm": 18.407673201533754, + "learning_rate": 3.1054018970405048e-09, + "logits/chosen": -0.20994678139686584, + "logits/rejected": -0.03689366206526756, + "logps/chosen": -3.2366814613342285, + "logps/rejected": -4.640575885772705, + "loss": 0.4153, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -3.2366814613342285, + "rewards/margins": 1.4038949012756348, + "rewards/rejected": -4.640575885772705, + "sft_loss": 3.4189772605895996, + "step": 5425 + }, + { + "epoch": 2.906171600602107, + "grad_norm": 16.73506873996829, + "learning_rate": 2.9345062794238207e-09, + "logits/chosen": -0.28392454981803894, + "logits/rejected": -0.05108907073736191, + "logps/chosen": -3.1887130737304688, + "logps/rejected": -4.554531574249268, + "loss": 0.4022, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.1887130737304688, + "rewards/margins": 1.365817904472351, + "rewards/rejected": -4.554531574249268, + "sft_loss": 3.3268024921417236, + "step": 5430 + }, + { + "epoch": 2.908847633383509, + "grad_norm": 19.505222114816558, + "learning_rate": 2.7684331934072492e-09, + "logits/chosen": -0.3449562191963196, + "logits/rejected": -0.2275485098361969, + "logps/chosen": -3.114872694015503, + "logps/rejected": -4.464018821716309, + "loss": 0.4348, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.114872694015503, + "rewards/margins": 1.3491462469100952, + "rewards/rejected": -4.464018821716309, + "sft_loss": 3.3187732696533203, + "step": 5435 + }, + { + "epoch": 2.9115236661649107, + "grad_norm": 14.20180113281798, + "learning_rate": 2.6071842502326526e-09, + "logits/chosen": -0.30079519748687744, + "logits/rejected": -0.11716896295547485, + "logps/chosen": -3.1549365520477295, + "logps/rejected": -4.319884777069092, + "loss": 0.4552, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.1549365520477295, + "rewards/margins": 1.1649483442306519, + "rewards/rejected": -4.319884777069092, + "sft_loss": 3.3109402656555176, + "step": 5440 + }, + { + "epoch": 2.9141996989463124, + "grad_norm": 21.58427056227777, + "learning_rate": 2.450761014337888e-09, + "logits/chosen": -0.08962593972682953, + "logits/rejected": -0.01259320043027401, + "logps/chosen": -3.1360087394714355, + "logps/rejected": -4.532161712646484, + "loss": 0.4774, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.1360087394714355, + "rewards/margins": 1.3961527347564697, + "rewards/rejected": -4.532161712646484, + "sft_loss": 3.270613193511963, + "step": 5445 + }, + { + "epoch": 2.9168757317277136, + "grad_norm": 21.42168683506041, + "learning_rate": 2.299165003341985e-09, + "logits/chosen": -0.07372574508190155, + "logits/rejected": 0.03359115868806839, + "logps/chosen": -3.2150301933288574, + "logps/rejected": -4.484751224517822, + "loss": 0.4444, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.2150301933288574, + "rewards/margins": 1.2697209119796753, + "rewards/rejected": -4.484751224517822, + "sft_loss": 3.3405425548553467, + "step": 5450 + }, + { + "epoch": 2.9195517645091154, + "grad_norm": 17.151872710305522, + "learning_rate": 2.1523976880299945e-09, + "logits/chosen": -0.28902652859687805, + "logits/rejected": -0.07120291888713837, + "logps/chosen": -3.227856397628784, + "logps/rejected": -4.356668472290039, + "loss": 0.487, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -3.227856397628784, + "rewards/margins": 1.128812551498413, + "rewards/rejected": -4.356668472290039, + "sft_loss": 3.3441262245178223, + "step": 5455 + }, + { + "epoch": 2.9222277972905166, + "grad_norm": 13.587709270053116, + "learning_rate": 2.010460492339161e-09, + "logits/chosen": -0.2607978880405426, + "logits/rejected": -0.06789745390415192, + "logps/chosen": -3.0319695472717285, + "logps/rejected": -4.351911544799805, + "loss": 0.4565, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.0319695472717285, + "rewards/margins": 1.319941759109497, + "rewards/rejected": -4.351911544799805, + "sft_loss": 3.221714735031128, + "step": 5460 + }, + { + "epoch": 2.9249038300719183, + "grad_norm": 12.862169052333378, + "learning_rate": 1.8733547933446614e-09, + "logits/chosen": -0.29628288745880127, + "logits/rejected": -0.019387567415833473, + "logps/chosen": -3.3344168663024902, + "logps/rejected": -4.406271934509277, + "loss": 0.495, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -3.3344168663024902, + "rewards/margins": 1.071855068206787, + "rewards/rejected": -4.406271934509277, + "sft_loss": 3.375487804412842, + "step": 5465 + }, + { + "epoch": 2.92757986285332, + "grad_norm": 30.64073459683912, + "learning_rate": 1.7410819212467231e-09, + "logits/chosen": -0.21239089965820312, + "logits/rejected": -0.10766670852899551, + "logps/chosen": -3.2013115882873535, + "logps/rejected": -4.312717914581299, + "loss": 0.4911, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -3.2013115882873535, + "rewards/margins": 1.1114060878753662, + "rewards/rejected": -4.312717914581299, + "sft_loss": 3.416760206222534, + "step": 5470 + }, + { + "epoch": 2.9302558956347218, + "grad_norm": 16.489737610950392, + "learning_rate": 1.613643159357192e-09, + "logits/chosen": -0.18608808517456055, + "logits/rejected": -0.2229352444410324, + "logps/chosen": -3.1164584159851074, + "logps/rejected": -4.203319549560547, + "loss": 0.4869, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -3.1164584159851074, + "rewards/margins": 1.0868608951568604, + "rewards/rejected": -4.203319549560547, + "sft_loss": 3.298535108566284, + "step": 5475 + }, + { + "epoch": 2.932931928416123, + "grad_norm": 18.91438897778467, + "learning_rate": 1.4910397440875967e-09, + "logits/chosen": -0.2308618724346161, + "logits/rejected": -0.06928835809230804, + "logps/chosen": -3.2458624839782715, + "logps/rejected": -4.498655796051025, + "loss": 0.4807, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -3.2458624839782715, + "rewards/margins": 1.2527928352355957, + "rewards/rejected": -4.498655796051025, + "sft_loss": 3.3884501457214355, + "step": 5480 + }, + { + "epoch": 2.9356079611975248, + "grad_norm": 19.68856849840769, + "learning_rate": 1.3732728649368253e-09, + "logits/chosen": -0.18805158138275146, + "logits/rejected": 0.04152602702379227, + "logps/chosen": -3.037163019180298, + "logps/rejected": -4.186461448669434, + "loss": 0.4366, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -3.037163019180298, + "rewards/margins": 1.149298906326294, + "rewards/rejected": -4.186461448669434, + "sft_loss": 3.1664881706237793, + "step": 5485 + }, + { + "epoch": 2.938283993978926, + "grad_norm": 21.89724872026843, + "learning_rate": 1.260343664479524e-09, + "logits/chosen": -0.23621201515197754, + "logits/rejected": -0.15477347373962402, + "logps/chosen": -3.1175694465637207, + "logps/rejected": -4.36012601852417, + "loss": 0.4541, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -3.1175694465637207, + "rewards/margins": 1.2425566911697388, + "rewards/rejected": -4.36012601852417, + "sft_loss": 3.3829116821289062, + "step": 5490 + }, + { + "epoch": 2.9409600267603278, + "grad_norm": 15.19303404485659, + "learning_rate": 1.1522532383554384e-09, + "logits/chosen": -0.297503262758255, + "logits/rejected": -0.03792861849069595, + "logps/chosen": -3.0819716453552246, + "logps/rejected": -4.551427841186523, + "loss": 0.397, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.0819716453552246, + "rewards/margins": 1.4694559574127197, + "rewards/rejected": -4.551427841186523, + "sft_loss": 3.3261775970458984, + "step": 5495 + }, + { + "epoch": 2.9436360595417295, + "grad_norm": 14.193725945698466, + "learning_rate": 1.049002635258256e-09, + "logits/chosen": -0.1557658463716507, + "logits/rejected": -0.01763027533888817, + "logps/chosen": -3.2304654121398926, + "logps/rejected": -4.36229944229126, + "loss": 0.477, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.2304654121398926, + "rewards/margins": 1.1318340301513672, + "rewards/rejected": -4.36229944229126, + "sft_loss": 3.355865001678467, + "step": 5500 + }, + { + "epoch": 2.946312092323131, + "grad_norm": 20.967168324555896, + "learning_rate": 9.505928569258358e-10, + "logits/chosen": -0.17183566093444824, + "logits/rejected": -0.13018515706062317, + "logps/chosen": -3.193500280380249, + "logps/rejected": -4.40403413772583, + "loss": 0.4508, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.193500280380249, + "rewards/margins": 1.210533618927002, + "rewards/rejected": -4.40403413772583, + "sft_loss": 3.395143985748291, + "step": 5505 + }, + { + "epoch": 2.9489881251045325, + "grad_norm": 16.65492714187424, + "learning_rate": 8.57024858130273e-10, + "logits/chosen": -0.2834213376045227, + "logits/rejected": -0.08938617259263992, + "logps/chosen": -3.1837801933288574, + "logps/rejected": -4.70327091217041, + "loss": 0.4322, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -3.1837801933288574, + "rewards/margins": 1.5194900035858154, + "rewards/rejected": -4.70327091217041, + "sft_loss": 3.2524771690368652, + "step": 5510 + }, + { + "epoch": 2.951664157885934, + "grad_norm": 15.621599775505928, + "learning_rate": 7.682995466686826e-10, + "logits/chosen": -0.3269936442375183, + "logits/rejected": -0.14027708768844604, + "logps/chosen": -3.178954601287842, + "logps/rejected": -4.473852634429932, + "loss": 0.4562, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -3.178954601287842, + "rewards/margins": 1.2948976755142212, + "rewards/rejected": -4.473852634429932, + "sft_loss": 3.4053070545196533, + "step": 5515 + }, + { + "epoch": 2.9543401906673354, + "grad_norm": 18.20048266619669, + "learning_rate": 6.844177833543741e-10, + "logits/chosen": -0.2227473258972168, + "logits/rejected": -0.12284767627716064, + "logps/chosen": -3.1623284816741943, + "logps/rejected": -4.347644329071045, + "loss": 0.4639, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -3.1623284816741943, + "rewards/margins": 1.1853158473968506, + "rewards/rejected": -4.347644329071045, + "sft_loss": 3.2885944843292236, + "step": 5520 + }, + { + "epoch": 2.957016223448737, + "grad_norm": 20.387338307524058, + "learning_rate": 6.053803820087467e-10, + "logits/chosen": -0.21222662925720215, + "logits/rejected": -0.028198879212141037, + "logps/chosen": -3.389390230178833, + "logps/rejected": -4.714000225067139, + "loss": 0.4668, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -3.389390230178833, + "rewards/margins": 1.3246099948883057, + "rewards/rejected": -4.714000225067139, + "sft_loss": 3.579840898513794, + "step": 5525 + }, + { + "epoch": 2.959692256230139, + "grad_norm": 15.261340925699978, + "learning_rate": 5.311881094528514e-10, + "logits/chosen": -0.33066612482070923, + "logits/rejected": -0.05994957685470581, + "logps/chosen": -3.28670072555542, + "logps/rejected": -4.361258029937744, + "loss": 0.4708, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.28670072555542, + "rewards/margins": 1.0745580196380615, + "rewards/rejected": -4.361258029937744, + "sft_loss": 3.3876736164093018, + "step": 5530 + }, + { + "epoch": 2.9623682890115406, + "grad_norm": 22.93299851088539, + "learning_rate": 4.6184168550050806e-10, + "logits/chosen": -0.2684002220630646, + "logits/rejected": -0.19519592821598053, + "logps/chosen": -3.2278761863708496, + "logps/rejected": -4.352793216705322, + "loss": 0.5092, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -3.2278761863708496, + "rewards/margins": 1.1249163150787354, + "rewards/rejected": -4.352793216705322, + "sft_loss": 3.4347176551818848, + "step": 5535 + }, + { + "epoch": 2.965044321792942, + "grad_norm": 19.22349114083425, + "learning_rate": 3.973417829510328e-10, + "logits/chosen": -0.3656173050403595, + "logits/rejected": -0.20642141997814178, + "logps/chosen": -3.1979427337646484, + "logps/rejected": -4.361952304840088, + "loss": 0.4841, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -3.1979427337646484, + "rewards/margins": 1.1640093326568604, + "rewards/rejected": -4.361952304840088, + "sft_loss": 3.2818546295166016, + "step": 5540 + }, + { + "epoch": 2.9677203545743436, + "grad_norm": 23.038096136486516, + "learning_rate": 3.3768902758274377e-10, + "logits/chosen": -0.2491336315870285, + "logits/rejected": -0.10685531795024872, + "logps/chosen": -3.1194095611572266, + "logps/rejected": -4.261729717254639, + "loss": 0.4646, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.1194095611572266, + "rewards/margins": 1.142319917678833, + "rewards/rejected": -4.261729717254639, + "sft_loss": 3.1879723072052, + "step": 5545 + }, + { + "epoch": 2.970396387355745, + "grad_norm": 12.739377033329509, + "learning_rate": 2.8288399814691e-10, + "logits/chosen": -0.15858404338359833, + "logits/rejected": -0.0856924057006836, + "logps/chosen": -3.145125150680542, + "logps/rejected": -4.276131629943848, + "loss": 0.4553, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.145125150680542, + "rewards/margins": 1.1310064792633057, + "rewards/rejected": -4.276131629943848, + "sft_loss": 3.2625198364257812, + "step": 5550 + }, + { + "epoch": 2.9730724201371466, + "grad_norm": 20.870769093388706, + "learning_rate": 2.3292722636220066e-10, + "logits/chosen": -0.269282728433609, + "logits/rejected": -0.041368477046489716, + "logps/chosen": -3.202881336212158, + "logps/rejected": -4.5406646728515625, + "loss": 0.4462, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.202881336212158, + "rewards/margins": 1.3377827405929565, + "rewards/rejected": -4.5406646728515625, + "sft_loss": 3.2747268676757812, + "step": 5555 + }, + { + "epoch": 2.9757484529185483, + "grad_norm": 19.267796921776338, + "learning_rate": 1.8781919690946668e-10, + "logits/chosen": -0.1855693757534027, + "logits/rejected": -0.13502904772758484, + "logps/chosen": -3.1918249130249023, + "logps/rejected": -4.2519426345825195, + "loss": 0.5034, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.1918249130249023, + "rewards/margins": 1.060117483139038, + "rewards/rejected": -4.2519426345825195, + "sft_loss": 3.400783061981201, + "step": 5560 + }, + { + "epoch": 2.97842448569995, + "grad_norm": 20.50863505394151, + "learning_rate": 1.4756034742696711e-10, + "logits/chosen": -0.2911008596420288, + "logits/rejected": -0.1451537311077118, + "logps/chosen": -3.2650890350341797, + "logps/rejected": -4.429064750671387, + "loss": 0.5037, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.2650890350341797, + "rewards/margins": 1.163975477218628, + "rewards/rejected": -4.429064750671387, + "sft_loss": 3.4079482555389404, + "step": 5565 + }, + { + "epoch": 2.9811005184813513, + "grad_norm": 15.847220799145843, + "learning_rate": 1.12151068506261e-10, + "logits/chosen": -0.2483815848827362, + "logits/rejected": -0.07205311954021454, + "logps/chosen": -3.121838092803955, + "logps/rejected": -4.662284851074219, + "loss": 0.4266, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.121838092803955, + "rewards/margins": 1.5404466390609741, + "rewards/rejected": -4.662284851074219, + "sft_loss": 3.2707436084747314, + "step": 5570 + }, + { + "epoch": 2.983776551262753, + "grad_norm": 17.09495394877967, + "learning_rate": 8.159170368826629e-11, + "logits/chosen": -0.261943519115448, + "logits/rejected": -0.061380576342344284, + "logps/chosen": -2.987300395965576, + "logps/rejected": -4.289914131164551, + "loss": 0.4827, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.987300395965576, + "rewards/margins": 1.3026129007339478, + "rewards/rejected": -4.289914131164551, + "sft_loss": 3.1605923175811768, + "step": 5575 + }, + { + "epoch": 2.9864525840441547, + "grad_norm": 23.620417611688563, + "learning_rate": 5.588254946015114e-11, + "logits/chosen": -0.3461746275424957, + "logits/rejected": -0.020816374570131302, + "logps/chosen": -3.119680166244507, + "logps/rejected": -4.386064529418945, + "loss": 0.4468, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -3.119680166244507, + "rewards/margins": 1.2663848400115967, + "rewards/rejected": -4.386064529418945, + "sft_loss": 3.278569459915161, + "step": 5580 + }, + { + "epoch": 2.989128616825556, + "grad_norm": 14.07239124920289, + "learning_rate": 3.502385525216978e-11, + "logits/chosen": -0.3145311772823334, + "logits/rejected": -0.09763443470001221, + "logps/chosen": -3.109294891357422, + "logps/rejected": -4.4843902587890625, + "loss": 0.4082, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.109294891357422, + "rewards/margins": 1.375095009803772, + "rewards/rejected": -4.4843902587890625, + "sft_loss": 3.376676559448242, + "step": 5585 + }, + { + "epoch": 2.9918046496069577, + "grad_norm": 18.215483819237175, + "learning_rate": 1.901582343555308e-11, + "logits/chosen": -0.21330972015857697, + "logits/rejected": -0.11640063673257828, + "logps/chosen": -3.2996044158935547, + "logps/rejected": -4.525092124938965, + "loss": 0.4692, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.2996044158935547, + "rewards/margins": 1.225487470626831, + "rewards/rejected": -4.525092124938965, + "sft_loss": 3.3841681480407715, + "step": 5590 + }, + { + "epoch": 2.9944806823883594, + "grad_norm": 25.746389433320907, + "learning_rate": 7.858609320232634e-12, + "logits/chosen": -0.2573150098323822, + "logits/rejected": -0.025281842797994614, + "logps/chosen": -3.1486496925354004, + "logps/rejected": -4.428877353668213, + "loss": 0.445, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -3.1486496925354004, + "rewards/margins": 1.2802274227142334, + "rewards/rejected": -4.428877353668213, + "sft_loss": 3.344500780105591, + "step": 5595 + }, + { + "epoch": 2.9971567151697607, + "grad_norm": 18.07974208741398, + "learning_rate": 1.5523211535639624e-12, + "logits/chosen": -0.26546934247016907, + "logits/rejected": -0.10036937892436981, + "logps/chosen": -3.194868564605713, + "logps/rejected": -4.7231011390686035, + "loss": 0.459, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.194868564605713, + "rewards/margins": 1.528232216835022, + "rewards/rejected": -4.7231011390686035, + "sft_loss": 3.330408811569214, + "step": 5600 + }, + { + "epoch": 2.9971567151697607, + "eval_logits/chosen": 0.15628661215305328, + "eval_logits/rejected": 0.28109732270240784, + "eval_logps/chosen": -3.3285162448883057, + "eval_logps/rejected": -4.381001949310303, + "eval_loss": 0.5583122968673706, + "eval_rewards/accuracies": 0.7225519418716431, + "eval_rewards/chosen": -3.3285162448883057, + "eval_rewards/margins": 1.0524852275848389, + "eval_rewards/rejected": -4.381001949310303, + "eval_runtime": 51.2808, + "eval_samples_per_second": 26.228, + "eval_sft_loss": 3.47049617767334, + "eval_steps_per_second": 6.572, + "step": 5600 + }, + { + "epoch": 2.999297541394882, + "step": 5604, + "total_flos": 0.0, + "train_loss": 0.5516859670777903, + "train_runtime": 39088.5646, + "train_samples_per_second": 4.589, + "train_steps_per_second": 0.143 + } + ], + "logging_steps": 5, + "max_steps": 5604, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 1000000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}