{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9056603773584906, "eval_steps": 500, "global_step": 120, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.007547169811320755, "grad_norm": 4.023828029632568, "learning_rate": 1.8518518518518518e-07, "logps/chosen": -29.54261589050293, "logps/rejected": -34.2619514465332, "loss": 0.6928, "losses/dpo": 0.6891345977783203, "losses/sft": 1.0977673530578613, "losses/total": 0.6891345977783203, "ref_logps/chosen": -29.5439395904541, "ref_logps/rejected": -34.251976013183594, "rewards/accuracies": 0.515625, "rewards/chosen": 0.00013234722428023815, "rewards/margins": 0.0011300418991595507, "rewards/rejected": -0.0009976944420486689, "step": 1 }, { "epoch": 0.01509433962264151, "grad_norm": 3.774498701095581, "learning_rate": 3.7037037037037036e-07, "logps/chosen": -28.00726890563965, "logps/rejected": -34.31509017944336, "loss": 0.6933, "losses/dpo": 0.6932892799377441, "losses/sft": 1.1102490425109863, "losses/total": 0.6932892799377441, "ref_logps/chosen": -27.994796752929688, "ref_logps/rejected": -34.30337905883789, "rewards/accuracies": 0.4921875, "rewards/chosen": -0.0012474276591092348, "rewards/margins": -7.60925468057394e-05, "rewards/rejected": -0.0011713354615494609, "step": 2 }, { "epoch": 0.022641509433962263, "grad_norm": 4.361927032470703, "learning_rate": 5.555555555555555e-07, "logps/chosen": -32.22264099121094, "logps/rejected": -39.77468490600586, "loss": 0.6923, "losses/dpo": 0.6875869035720825, "losses/sft": 1.1771142482757568, "losses/total": 0.6875869035720825, "ref_logps/chosen": -32.23395919799805, "ref_logps/rejected": -39.76538848876953, "rewards/accuracies": 0.546875, "rewards/chosen": 0.0011316344607621431, "rewards/margins": 0.002061467617750168, "rewards/rejected": -0.0009298332734033465, "step": 3 }, { "epoch": 0.03018867924528302, "grad_norm": 4.030606746673584, "learning_rate": 7.407407407407407e-07, "logps/chosen": -29.95352554321289, "logps/rejected": -35.96381378173828, "loss": 0.6931, "losses/dpo": 0.6984821557998657, "losses/sft": 1.119036078453064, "losses/total": 0.6984821557998657, "ref_logps/chosen": -29.93365478515625, "ref_logps/rejected": -35.9401969909668, "rewards/accuracies": 0.53125, "rewards/chosen": -0.0019873534329235554, "rewards/margins": 0.00037475768476724625, "rewards/rejected": -0.002362110884860158, "step": 4 }, { "epoch": 0.03773584905660377, "grad_norm": 4.286405563354492, "learning_rate": 9.259259259259259e-07, "logps/chosen": -33.54985046386719, "logps/rejected": -38.25576400756836, "loss": 0.6932, "losses/dpo": 0.6964197158813477, "losses/sft": 0.9851586818695068, "losses/total": 0.6964197158813477, "ref_logps/chosen": -33.534912109375, "ref_logps/rejected": -38.2386474609375, "rewards/accuracies": 0.484375, "rewards/chosen": -0.0014938730746507645, "rewards/margins": 0.00021800631657242775, "rewards/rejected": -0.0017118793912231922, "step": 5 }, { "epoch": 0.045283018867924525, "grad_norm": 3.715064764022827, "learning_rate": 1.111111111111111e-06, "logps/chosen": -28.753326416015625, "logps/rejected": -33.94843292236328, "loss": 0.6956, "losses/dpo": 0.696850061416626, "losses/sft": 0.9852187633514404, "losses/total": 0.696850061416626, "ref_logps/chosen": -28.712608337402344, "ref_logps/rejected": -33.953330993652344, "rewards/accuracies": 0.4375, "rewards/chosen": -0.004071743693202734, "rewards/margins": -0.0045617930591106415, "rewards/rejected": 0.0004900495987385511, "step": 6 }, { "epoch": 0.052830188679245285, "grad_norm": 4.02261209487915, "learning_rate": 1.2962962962962962e-06, "logps/chosen": -29.45867347717285, "logps/rejected": -34.641387939453125, "loss": 0.6942, "losses/dpo": 0.6925374269485474, "losses/sft": 1.0857254266738892, "losses/total": 0.6925374269485474, "ref_logps/chosen": -29.440855026245117, "ref_logps/rejected": -34.63925552368164, "rewards/accuracies": 0.4765625, "rewards/chosen": -0.001781845698133111, "rewards/margins": -0.0015682985540479422, "rewards/rejected": -0.00021354644559323788, "step": 7 }, { "epoch": 0.06037735849056604, "grad_norm": 4.2059502601623535, "learning_rate": 1.4814814814814815e-06, "logps/chosen": -34.3134651184082, "logps/rejected": -35.359073638916016, "loss": 0.693, "losses/dpo": 0.6923079490661621, "losses/sft": 1.0715954303741455, "losses/total": 0.6923079490661621, "ref_logps/chosen": -34.33416748046875, "ref_logps/rejected": -35.37400817871094, "rewards/accuracies": 0.5234375, "rewards/chosen": 0.0020700739696621895, "rewards/margins": 0.000576640130020678, "rewards/rejected": 0.001493434072472155, "step": 8 }, { "epoch": 0.06792452830188679, "grad_norm": 5.0255889892578125, "learning_rate": 1.6666666666666667e-06, "logps/chosen": -30.589462280273438, "logps/rejected": -39.17620849609375, "loss": 0.6948, "losses/dpo": 0.6894321441650391, "losses/sft": 0.8859687447547913, "losses/total": 0.6894321441650391, "ref_logps/chosen": -30.609333038330078, "ref_logps/rejected": -39.22538757324219, "rewards/accuracies": 0.5390625, "rewards/chosen": 0.0019869431853294373, "rewards/margins": -0.002930472604930401, "rewards/rejected": 0.004917416721582413, "step": 9 }, { "epoch": 0.07547169811320754, "grad_norm": 4.278906345367432, "learning_rate": 1.8518518518518519e-06, "logps/chosen": -26.85662841796875, "logps/rejected": -34.258155822753906, "loss": 0.6886, "losses/dpo": 0.6904473304748535, "losses/sft": 1.016200304031372, "losses/total": 0.6904473304748535, "ref_logps/chosen": -27.009971618652344, "ref_logps/rejected": -34.316612243652344, "rewards/accuracies": 0.578125, "rewards/chosen": 0.015334523282945156, "rewards/margins": 0.009488685056567192, "rewards/rejected": 0.005845838226377964, "step": 10 }, { "epoch": 0.0830188679245283, "grad_norm": 3.700289487838745, "learning_rate": 2.037037037037037e-06, "logps/chosen": -29.495962142944336, "logps/rejected": -33.246185302734375, "loss": 0.6915, "losses/dpo": 0.6893143653869629, "losses/sft": 1.2568368911743164, "losses/total": 0.6893143653869629, "ref_logps/chosen": -29.658611297607422, "ref_logps/rejected": -33.370635986328125, "rewards/accuracies": 0.5390625, "rewards/chosen": 0.01626480370759964, "rewards/margins": 0.003819521516561508, "rewards/rejected": 0.012445282191038132, "step": 11 }, { "epoch": 0.09056603773584905, "grad_norm": 3.853537082672119, "learning_rate": 2.222222222222222e-06, "logps/chosen": -30.57427215576172, "logps/rejected": -34.356842041015625, "loss": 0.6943, "losses/dpo": 0.702874481678009, "losses/sft": 1.008819580078125, "losses/total": 0.702874481678009, "ref_logps/chosen": -30.768930435180664, "ref_logps/rejected": -34.566131591796875, "rewards/accuracies": 0.5234375, "rewards/chosen": 0.01946595311164856, "rewards/margins": -0.0014625652693212032, "rewards/rejected": 0.0209285207092762, "step": 12 }, { "epoch": 0.09811320754716982, "grad_norm": 4.208248138427734, "learning_rate": 2.4074074074074075e-06, "logps/chosen": -30.429744720458984, "logps/rejected": -39.09281539916992, "loss": 0.6895, "losses/dpo": 0.7104582786560059, "losses/sft": 1.0792236328125, "losses/total": 0.7104582786560059, "ref_logps/chosen": -30.72281265258789, "ref_logps/rejected": -39.29808044433594, "rewards/accuracies": 0.578125, "rewards/chosen": 0.029306560754776, "rewards/margins": 0.00877977255731821, "rewards/rejected": 0.020526789128780365, "step": 13 }, { "epoch": 0.10566037735849057, "grad_norm": 3.7299113273620605, "learning_rate": 2.5925925925925925e-06, "logps/chosen": -27.8311710357666, "logps/rejected": -33.181427001953125, "loss": 0.6862, "losses/dpo": 0.6882991790771484, "losses/sft": 1.0185338258743286, "losses/total": 0.6882991790771484, "ref_logps/chosen": -28.355030059814453, "ref_logps/rejected": -33.544822692871094, "rewards/accuracies": 0.5625, "rewards/chosen": 0.052385710179805756, "rewards/margins": 0.016046730801463127, "rewards/rejected": 0.03633897751569748, "step": 14 }, { "epoch": 0.11320754716981132, "grad_norm": 3.8657593727111816, "learning_rate": 2.7777777777777783e-06, "logps/chosen": -28.884387969970703, "logps/rejected": -34.36138916015625, "loss": 0.6913, "losses/dpo": 0.6939971446990967, "losses/sft": 1.0502238273620605, "losses/total": 0.6939971446990967, "ref_logps/chosen": -29.45929718017578, "ref_logps/rejected": -34.86326599121094, "rewards/accuracies": 0.5703125, "rewards/chosen": 0.05749073624610901, "rewards/margins": 0.0073026856407523155, "rewards/rejected": 0.05018804967403412, "step": 15 }, { "epoch": 0.12075471698113208, "grad_norm": 3.9700570106506348, "learning_rate": 2.962962962962963e-06, "logps/chosen": -30.124591827392578, "logps/rejected": -31.684345245361328, "loss": 0.6928, "losses/dpo": 0.6692728996276855, "losses/sft": 1.005508542060852, "losses/total": 0.6692728996276855, "ref_logps/chosen": -30.774919509887695, "ref_logps/rejected": -32.27117919921875, "rewards/accuracies": 0.515625, "rewards/chosen": 0.06503286957740784, "rewards/margins": 0.00634903647005558, "rewards/rejected": 0.058683834969997406, "step": 16 }, { "epoch": 0.12830188679245283, "grad_norm": 3.9416866302490234, "learning_rate": 3.1481481481481483e-06, "logps/chosen": -30.775650024414062, "logps/rejected": -34.58787155151367, "loss": 0.6834, "losses/dpo": 0.7104189395904541, "losses/sft": 0.9935987591743469, "losses/total": 0.7104189395904541, "ref_logps/chosen": -31.45960235595703, "ref_logps/rejected": -35.01028060913086, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0683954730629921, "rewards/margins": 0.0261550135910511, "rewards/rejected": 0.042240455746650696, "step": 17 }, { "epoch": 0.13584905660377358, "grad_norm": 3.8401975631713867, "learning_rate": 3.3333333333333333e-06, "logps/chosen": -29.183441162109375, "logps/rejected": -33.91481018066406, "loss": 0.6689, "losses/dpo": 0.6731781363487244, "losses/sft": 1.0918514728546143, "losses/total": 0.6731781363487244, "ref_logps/chosen": -30.14259147644043, "ref_logps/rejected": -34.299861907958984, "rewards/accuracies": 0.6328125, "rewards/chosen": 0.09591513872146606, "rewards/margins": 0.057410385459661484, "rewards/rejected": 0.03850475698709488, "step": 18 }, { "epoch": 0.14339622641509434, "grad_norm": 3.793499231338501, "learning_rate": 3.5185185185185187e-06, "logps/chosen": -25.331146240234375, "logps/rejected": -34.978057861328125, "loss": 0.6692, "losses/dpo": 0.7569133639335632, "losses/sft": 0.9651777744293213, "losses/total": 0.7569133639335632, "ref_logps/chosen": -26.447914123535156, "ref_logps/rejected": -35.43275833129883, "rewards/accuracies": 0.65625, "rewards/chosen": 0.1116766482591629, "rewards/margins": 0.06620671600103378, "rewards/rejected": 0.04546992480754852, "step": 19 }, { "epoch": 0.1509433962264151, "grad_norm": 3.7824015617370605, "learning_rate": 3.7037037037037037e-06, "logps/chosen": -29.415006637573242, "logps/rejected": -34.36804962158203, "loss": 0.6666, "losses/dpo": 0.6621171832084656, "losses/sft": 0.7287149429321289, "losses/total": 0.6621171832084656, "ref_logps/chosen": -30.477230072021484, "ref_logps/rejected": -34.688987731933594, "rewards/accuracies": 0.6015625, "rewards/chosen": 0.10622246563434601, "rewards/margins": 0.0741288959980011, "rewards/rejected": 0.03209357336163521, "step": 20 }, { "epoch": 0.15849056603773584, "grad_norm": 4.047840595245361, "learning_rate": 3.88888888888889e-06, "logps/chosen": -29.250553131103516, "logps/rejected": -39.266815185546875, "loss": 0.6475, "losses/dpo": 0.647861123085022, "losses/sft": 0.9502246379852295, "losses/total": 0.647861123085022, "ref_logps/chosen": -30.495431900024414, "ref_logps/rejected": -39.307708740234375, "rewards/accuracies": 0.609375, "rewards/chosen": 0.12448810040950775, "rewards/margins": 0.12039919942617416, "rewards/rejected": 0.0040889037773013115, "step": 21 }, { "epoch": 0.1660377358490566, "grad_norm": 4.19422721862793, "learning_rate": 4.074074074074074e-06, "logps/chosen": -27.82355308532715, "logps/rejected": -35.92847442626953, "loss": 0.6735, "losses/dpo": 0.5619722604751587, "losses/sft": 0.8959212899208069, "losses/total": 0.5619722604751587, "ref_logps/chosen": -29.235668182373047, "ref_logps/rejected": -36.55792999267578, "rewards/accuracies": 0.5859375, "rewards/chosen": 0.14121143519878387, "rewards/margins": 0.07826600223779678, "rewards/rejected": 0.06294544041156769, "step": 22 }, { "epoch": 0.17358490566037735, "grad_norm": 3.807786464691162, "learning_rate": 4.2592592592592596e-06, "logps/chosen": -27.979816436767578, "logps/rejected": -38.823341369628906, "loss": 0.6627, "losses/dpo": 0.6460346579551697, "losses/sft": 1.1445313692092896, "losses/total": 0.6460346579551697, "ref_logps/chosen": -28.973108291625977, "ref_logps/rejected": -38.76581573486328, "rewards/accuracies": 0.5859375, "rewards/chosen": 0.09932918101549149, "rewards/margins": 0.10508126765489578, "rewards/rejected": -0.005752084776759148, "step": 23 }, { "epoch": 0.1811320754716981, "grad_norm": 4.215601921081543, "learning_rate": 4.444444444444444e-06, "logps/chosen": -30.01553726196289, "logps/rejected": -36.99456787109375, "loss": 0.6726, "losses/dpo": 0.7518600225448608, "losses/sft": 1.1231310367584229, "losses/total": 0.7518600225448608, "ref_logps/chosen": -30.876399993896484, "ref_logps/rejected": -37.021759033203125, "rewards/accuracies": 0.625, "rewards/chosen": 0.08608602732419968, "rewards/margins": 0.08336685597896576, "rewards/rejected": 0.0027191713452339172, "step": 24 }, { "epoch": 0.18867924528301888, "grad_norm": 3.991433620452881, "learning_rate": 4.62962962962963e-06, "logps/chosen": -29.20758056640625, "logps/rejected": -33.56412124633789, "loss": 0.6631, "losses/dpo": 0.5801557302474976, "losses/sft": 1.1197171211242676, "losses/total": 0.5801557302474976, "ref_logps/chosen": -30.49410629272461, "ref_logps/rejected": -33.72259521484375, "rewards/accuracies": 0.6171875, "rewards/chosen": 0.1286524534225464, "rewards/margins": 0.11280516535043716, "rewards/rejected": 0.01584728993475437, "step": 25 }, { "epoch": 0.19622641509433963, "grad_norm": 4.136443614959717, "learning_rate": 4.814814814814815e-06, "logps/chosen": -27.908275604248047, "logps/rejected": -36.784366607666016, "loss": 0.6475, "losses/dpo": 0.7332102060317993, "losses/sft": 0.7210355401039124, "losses/total": 0.7332102060317993, "ref_logps/chosen": -29.110748291015625, "ref_logps/rejected": -36.522212982177734, "rewards/accuracies": 0.6796875, "rewards/chosen": 0.12024737149477005, "rewards/margins": 0.14646272361278534, "rewards/rejected": -0.026215344667434692, "step": 26 }, { "epoch": 0.2037735849056604, "grad_norm": 3.6287527084350586, "learning_rate": 5e-06, "logps/chosen": -25.643430709838867, "logps/rejected": -33.46253204345703, "loss": 0.603, "losses/dpo": 0.6435042023658752, "losses/sft": 0.7656551599502563, "losses/total": 0.6435042023658752, "ref_logps/chosen": -27.53249740600586, "ref_logps/rejected": -32.8140869140625, "rewards/accuracies": 0.671875, "rewards/chosen": 0.18890666961669922, "rewards/margins": 0.25375133752822876, "rewards/rejected": -0.06484466791152954, "step": 27 }, { "epoch": 0.21132075471698114, "grad_norm": 5.5130720138549805, "learning_rate": 4.978902953586498e-06, "logps/chosen": -32.41269302368164, "logps/rejected": -36.27760314941406, "loss": 0.6718, "losses/dpo": 0.7667650580406189, "losses/sft": 1.180605411529541, "losses/total": 0.7667650580406189, "ref_logps/chosen": -32.947940826416016, "ref_logps/rejected": -35.710548400878906, "rewards/accuracies": 0.625, "rewards/chosen": 0.05352487415075302, "rewards/margins": 0.11023038625717163, "rewards/rejected": -0.05670551210641861, "step": 28 }, { "epoch": 0.2188679245283019, "grad_norm": 4.2082695960998535, "learning_rate": 4.957805907172996e-06, "logps/chosen": -30.38204574584961, "logps/rejected": -38.79253387451172, "loss": 0.6259, "losses/dpo": 0.6825762987136841, "losses/sft": 1.3010542392730713, "losses/total": 0.6825762987136841, "ref_logps/chosen": -31.19179916381836, "ref_logps/rejected": -37.440677642822266, "rewards/accuracies": 0.6640625, "rewards/chosen": 0.08097504824399948, "rewards/margins": 0.21616099774837494, "rewards/rejected": -0.13518595695495605, "step": 29 }, { "epoch": 0.22641509433962265, "grad_norm": 4.6551737785339355, "learning_rate": 4.936708860759495e-06, "logps/chosen": -30.135501861572266, "logps/rejected": -37.48721694946289, "loss": 0.6665, "losses/dpo": 0.5913557410240173, "losses/sft": 1.0625046491622925, "losses/total": 0.5913557410240173, "ref_logps/chosen": -30.550817489624023, "ref_logps/rejected": -36.66627502441406, "rewards/accuracies": 0.5859375, "rewards/chosen": 0.04153158515691757, "rewards/margins": 0.12362557649612427, "rewards/rejected": -0.08209399878978729, "step": 30 }, { "epoch": 0.2339622641509434, "grad_norm": 4.481552600860596, "learning_rate": 4.915611814345992e-06, "logps/chosen": -29.738265991210938, "logps/rejected": -40.146636962890625, "loss": 0.6341, "losses/dpo": 0.6154491305351257, "losses/sft": 0.7811817526817322, "losses/total": 0.6154491305351257, "ref_logps/chosen": -29.985429763793945, "ref_logps/rejected": -38.49150085449219, "rewards/accuracies": 0.625, "rewards/chosen": 0.02471642568707466, "rewards/margins": 0.19022998213768005, "rewards/rejected": -0.1655135601758957, "step": 31 }, { "epoch": 0.24150943396226415, "grad_norm": 4.287616729736328, "learning_rate": 4.89451476793249e-06, "logps/chosen": -28.966516494750977, "logps/rejected": -38.64032745361328, "loss": 0.6213, "losses/dpo": 0.5834950804710388, "losses/sft": 1.19804847240448, "losses/total": 0.5834950804710388, "ref_logps/chosen": -29.167491912841797, "ref_logps/rejected": -36.74009704589844, "rewards/accuracies": 0.671875, "rewards/chosen": 0.0200975239276886, "rewards/margins": 0.21012060344219208, "rewards/rejected": -0.19002306461334229, "step": 32 }, { "epoch": 0.2490566037735849, "grad_norm": 4.030296325683594, "learning_rate": 4.873417721518987e-06, "logps/chosen": -26.15782928466797, "logps/rejected": -33.3205451965332, "loss": 0.6113, "losses/dpo": 0.5751947164535522, "losses/sft": 0.7719302177429199, "losses/total": 0.5751947164535522, "ref_logps/chosen": -26.509279251098633, "ref_logps/rejected": -31.373973846435547, "rewards/accuracies": 0.71875, "rewards/chosen": 0.03514501452445984, "rewards/margins": 0.22980214655399323, "rewards/rejected": -0.1946571320295334, "step": 33 }, { "epoch": 0.25660377358490566, "grad_norm": 4.367461204528809, "learning_rate": 4.852320675105486e-06, "logps/chosen": -32.38875961303711, "logps/rejected": -40.95237350463867, "loss": 0.6236, "losses/dpo": 0.6270190477371216, "losses/sft": 1.1532888412475586, "losses/total": 0.6270190477371216, "ref_logps/chosen": -31.350940704345703, "ref_logps/rejected": -37.58573913574219, "rewards/accuracies": 0.71875, "rewards/chosen": -0.10378223657608032, "rewards/margins": 0.2328808605670929, "rewards/rejected": -0.3366630971431732, "step": 34 }, { "epoch": 0.2641509433962264, "grad_norm": 4.614798545837402, "learning_rate": 4.831223628691984e-06, "logps/chosen": -32.56526184082031, "logps/rejected": -40.50109100341797, "loss": 0.6059, "losses/dpo": 0.7241290211677551, "losses/sft": 1.3340615034103394, "losses/total": 0.7241290211677551, "ref_logps/chosen": -30.577272415161133, "ref_logps/rejected": -35.84385681152344, "rewards/accuracies": 0.6796875, "rewards/chosen": -0.19879914820194244, "rewards/margins": 0.26692429184913635, "rewards/rejected": -0.46572345495224, "step": 35 }, { "epoch": 0.27169811320754716, "grad_norm": 4.4597649574279785, "learning_rate": 4.8101265822784815e-06, "logps/chosen": -34.825111389160156, "logps/rejected": -44.15177917480469, "loss": 0.5946, "losses/dpo": 0.5129742622375488, "losses/sft": 0.856516420841217, "losses/total": 0.5129742622375488, "ref_logps/chosen": -32.00858688354492, "ref_logps/rejected": -38.221412658691406, "rewards/accuracies": 0.6953125, "rewards/chosen": -0.2816521227359772, "rewards/margins": 0.31138482689857483, "rewards/rejected": -0.593036949634552, "step": 36 }, { "epoch": 0.2792452830188679, "grad_norm": 4.562221527099609, "learning_rate": 4.789029535864979e-06, "logps/chosen": -33.27934646606445, "logps/rejected": -39.6660041809082, "loss": 0.598, "losses/dpo": 0.6043655276298523, "losses/sft": 1.0378785133361816, "losses/total": 0.6043655276298523, "ref_logps/chosen": -29.917388916015625, "ref_logps/rejected": -33.3465690612793, "rewards/accuracies": 0.6484375, "rewards/chosen": -0.33619576692581177, "rewards/margins": 0.29574763774871826, "rewards/rejected": -0.6319433450698853, "step": 37 }, { "epoch": 0.28679245283018867, "grad_norm": 5.42985725402832, "learning_rate": 4.767932489451477e-06, "logps/chosen": -36.16193389892578, "logps/rejected": -43.71050262451172, "loss": 0.6555, "losses/dpo": 0.6992093324661255, "losses/sft": 1.3419675827026367, "losses/total": 0.6992093324661255, "ref_logps/chosen": -31.39295196533203, "ref_logps/rejected": -36.14948654174805, "rewards/accuracies": 0.65625, "rewards/chosen": -0.47689831256866455, "rewards/margins": 0.2792032063007355, "rewards/rejected": -0.7561015486717224, "step": 38 }, { "epoch": 0.2943396226415094, "grad_norm": 4.9820051193237305, "learning_rate": 4.746835443037975e-06, "logps/chosen": -33.79109191894531, "logps/rejected": -42.09747314453125, "loss": 0.5883, "losses/dpo": 0.6338505148887634, "losses/sft": 1.2124994993209839, "losses/total": 0.6338505148887634, "ref_logps/chosen": -28.94110107421875, "ref_logps/rejected": -33.52173614501953, "rewards/accuracies": 0.7109375, "rewards/chosen": -0.4849993586540222, "rewards/margins": 0.3725742697715759, "rewards/rejected": -0.8575735688209534, "step": 39 }, { "epoch": 0.3018867924528302, "grad_norm": 5.098467826843262, "learning_rate": 4.725738396624473e-06, "logps/chosen": -34.21745681762695, "logps/rejected": -43.00538635253906, "loss": 0.6283, "losses/dpo": 0.7655435800552368, "losses/sft": 1.7303447723388672, "losses/total": 0.7655435800552368, "ref_logps/chosen": -27.329925537109375, "ref_logps/rejected": -33.21820068359375, "rewards/accuracies": 0.671875, "rewards/chosen": -0.6887531876564026, "rewards/margins": 0.2899653911590576, "rewards/rejected": -0.9787185788154602, "step": 40 }, { "epoch": 0.30943396226415093, "grad_norm": 4.568510055541992, "learning_rate": 4.7046413502109714e-06, "logps/chosen": -34.30303955078125, "logps/rejected": -45.21646499633789, "loss": 0.5376, "losses/dpo": 0.466990202665329, "losses/sft": 1.3411931991577148, "losses/total": 0.466990202665329, "ref_logps/chosen": -27.724037170410156, "ref_logps/rejected": -33.146297454833984, "rewards/accuracies": 0.7265625, "rewards/chosen": -0.657900333404541, "rewards/margins": 0.5491164326667786, "rewards/rejected": -1.2070167064666748, "step": 41 }, { "epoch": 0.3169811320754717, "grad_norm": 4.940774440765381, "learning_rate": 4.683544303797468e-06, "logps/chosen": -37.19551086425781, "logps/rejected": -48.22083282470703, "loss": 0.5488, "losses/dpo": 0.7535874843597412, "losses/sft": 1.5672276020050049, "losses/total": 0.7535874843597412, "ref_logps/chosen": -30.618778228759766, "ref_logps/rejected": -36.40138244628906, "rewards/accuracies": 0.7421875, "rewards/chosen": -0.6576732993125916, "rewards/margins": 0.5242711901664734, "rewards/rejected": -1.181944489479065, "step": 42 }, { "epoch": 0.32452830188679244, "grad_norm": 5.207009792327881, "learning_rate": 4.662447257383967e-06, "logps/chosen": -38.67348861694336, "logps/rejected": -46.259944915771484, "loss": 0.6086, "losses/dpo": 0.580439567565918, "losses/sft": 1.5248361825942993, "losses/total": 0.580439567565918, "ref_logps/chosen": -30.998300552368164, "ref_logps/rejected": -34.962703704833984, "rewards/accuracies": 0.671875, "rewards/chosen": -0.767518937587738, "rewards/margins": 0.3622052073478699, "rewards/rejected": -1.129724144935608, "step": 43 }, { "epoch": 0.3320754716981132, "grad_norm": 5.374455451965332, "learning_rate": 4.641350210970465e-06, "logps/chosen": -35.60981750488281, "logps/rejected": -45.90471649169922, "loss": 0.6258, "losses/dpo": 0.7469555139541626, "losses/sft": 1.5075256824493408, "losses/total": 0.7469555139541626, "ref_logps/chosen": -27.66849136352539, "ref_logps/rejected": -34.36457824707031, "rewards/accuracies": 0.6484375, "rewards/chosen": -0.79413241147995, "rewards/margins": 0.35988110303878784, "rewards/rejected": -1.1540133953094482, "step": 44 }, { "epoch": 0.33962264150943394, "grad_norm": 5.304266452789307, "learning_rate": 4.620253164556963e-06, "logps/chosen": -38.42974090576172, "logps/rejected": -45.68645477294922, "loss": 0.6133, "losses/dpo": 0.6757426261901855, "losses/sft": 1.243510127067566, "losses/total": 0.6757426261901855, "ref_logps/chosen": -29.328536987304688, "ref_logps/rejected": -32.262840270996094, "rewards/accuracies": 0.6796875, "rewards/chosen": -0.9101204872131348, "rewards/margins": 0.4322410821914673, "rewards/rejected": -1.342361569404602, "step": 45 }, { "epoch": 0.3471698113207547, "grad_norm": 5.428191184997559, "learning_rate": 4.5991561181434605e-06, "logps/chosen": -38.54790115356445, "logps/rejected": -49.04346466064453, "loss": 0.5944, "losses/dpo": 0.7710250020027161, "losses/sft": 1.5200599431991577, "losses/total": 0.7710250020027161, "ref_logps/chosen": -29.571321487426758, "ref_logps/rejected": -35.5488166809082, "rewards/accuracies": 0.671875, "rewards/chosen": -0.897658109664917, "rewards/margins": 0.45180660486221313, "rewards/rejected": -1.3494646549224854, "step": 46 }, { "epoch": 0.35471698113207545, "grad_norm": 5.276749610900879, "learning_rate": 4.578059071729958e-06, "logps/chosen": -42.278629302978516, "logps/rejected": -49.082420349121094, "loss": 0.5844, "losses/dpo": 0.5217583179473877, "losses/sft": 1.251055121421814, "losses/total": 0.5217583179473877, "ref_logps/chosen": -32.70330047607422, "ref_logps/rejected": -34.48522186279297, "rewards/accuracies": 0.7734375, "rewards/chosen": -0.9575330018997192, "rewards/margins": 0.5021871328353882, "rewards/rejected": -1.4597201347351074, "step": 47 }, { "epoch": 0.3622641509433962, "grad_norm": 4.85624361038208, "learning_rate": 4.556962025316456e-06, "logps/chosen": -37.618473052978516, "logps/rejected": -49.08882141113281, "loss": 0.5444, "losses/dpo": 0.4470449984073639, "losses/sft": 1.1268991231918335, "losses/total": 0.4470449984073639, "ref_logps/chosen": -29.47239112854004, "ref_logps/rejected": -34.867340087890625, "rewards/accuracies": 0.734375, "rewards/chosen": -0.8146083354949951, "rewards/margins": 0.6075396537780762, "rewards/rejected": -1.4221479892730713, "step": 48 }, { "epoch": 0.36981132075471695, "grad_norm": 5.045034408569336, "learning_rate": 4.535864978902954e-06, "logps/chosen": -33.628997802734375, "logps/rejected": -46.788063049316406, "loss": 0.5441, "losses/dpo": 0.4152263402938843, "losses/sft": 1.2738722562789917, "losses/total": 0.4152263402938843, "ref_logps/chosen": -25.57724952697754, "ref_logps/rejected": -33.02429962158203, "rewards/accuracies": 0.7265625, "rewards/chosen": -0.8051748275756836, "rewards/margins": 0.5712013840675354, "rewards/rejected": -1.3763761520385742, "step": 49 }, { "epoch": 0.37735849056603776, "grad_norm": 4.926025867462158, "learning_rate": 4.514767932489452e-06, "logps/chosen": -40.1754264831543, "logps/rejected": -53.34943389892578, "loss": 0.5357, "losses/dpo": 0.42820218205451965, "losses/sft": 1.4205973148345947, "losses/total": 0.42820218205451965, "ref_logps/chosen": -30.12688636779785, "ref_logps/rejected": -36.76472091674805, "rewards/accuracies": 0.7421875, "rewards/chosen": -1.0048537254333496, "rewards/margins": 0.6536170840263367, "rewards/rejected": -1.658470869064331, "step": 50 }, { "epoch": 0.3849056603773585, "grad_norm": 5.366219997406006, "learning_rate": 4.4936708860759495e-06, "logps/chosen": -41.36455535888672, "logps/rejected": -51.32182312011719, "loss": 0.5498, "losses/dpo": 0.5175436735153198, "losses/sft": 1.3962128162384033, "losses/total": 0.5175436735153198, "ref_logps/chosen": -31.545040130615234, "ref_logps/rejected": -34.96824264526367, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9819516539573669, "rewards/margins": 0.6534063220024109, "rewards/rejected": -1.6353578567504883, "step": 51 }, { "epoch": 0.39245283018867927, "grad_norm": 5.386270523071289, "learning_rate": 4.472573839662447e-06, "logps/chosen": -37.03908920288086, "logps/rejected": -50.11444854736328, "loss": 0.5742, "losses/dpo": 0.8617876768112183, "losses/sft": 2.0467562675476074, "losses/total": 0.8617876768112183, "ref_logps/chosen": -27.370098114013672, "ref_logps/rejected": -34.820045471191406, "rewards/accuracies": 0.7109375, "rewards/chosen": -0.9668989181518555, "rewards/margins": 0.5625417232513428, "rewards/rejected": -1.5294406414031982, "step": 52 }, { "epoch": 0.4, "grad_norm": 4.991089820861816, "learning_rate": 4.451476793248945e-06, "logps/chosen": -36.202457427978516, "logps/rejected": -46.09682083129883, "loss": 0.5363, "losses/dpo": 0.4159512221813202, "losses/sft": 1.2818732261657715, "losses/total": 0.4159512221813202, "ref_logps/chosen": -27.4339542388916, "ref_logps/rejected": -30.41155433654785, "rewards/accuracies": 0.734375, "rewards/chosen": -0.8768501281738281, "rewards/margins": 0.6916766166687012, "rewards/rejected": -1.5685267448425293, "step": 53 }, { "epoch": 0.4075471698113208, "grad_norm": 5.318048000335693, "learning_rate": 4.430379746835443e-06, "logps/chosen": -40.78857421875, "logps/rejected": -48.64263153076172, "loss": 0.5552, "losses/dpo": 0.5250239372253418, "losses/sft": 1.3805458545684814, "losses/total": 0.5250239372253418, "ref_logps/chosen": -31.36919403076172, "ref_logps/rejected": -32.82971954345703, "rewards/accuracies": 0.6640625, "rewards/chosen": -0.9419378638267517, "rewards/margins": 0.6393535733222961, "rewards/rejected": -1.5812914371490479, "step": 54 }, { "epoch": 0.41509433962264153, "grad_norm": 5.705236911773682, "learning_rate": 4.409282700421942e-06, "logps/chosen": -39.51799011230469, "logps/rejected": -50.53668975830078, "loss": 0.5586, "losses/dpo": 0.7799692153930664, "losses/sft": 1.2691373825073242, "losses/total": 0.7799692153930664, "ref_logps/chosen": -29.568626403808594, "ref_logps/rejected": -34.02891540527344, "rewards/accuracies": 0.6953125, "rewards/chosen": -0.9949361681938171, "rewards/margins": 0.6558418273925781, "rewards/rejected": -1.6507779359817505, "step": 55 }, { "epoch": 0.4226415094339623, "grad_norm": 5.554488658905029, "learning_rate": 4.3881856540084394e-06, "logps/chosen": -38.21480941772461, "logps/rejected": -47.99286651611328, "loss": 0.5795, "losses/dpo": 0.5031176805496216, "losses/sft": 1.3220183849334717, "losses/total": 0.5031176805496216, "ref_logps/chosen": -29.2410888671875, "ref_logps/rejected": -33.15393829345703, "rewards/accuracies": 0.703125, "rewards/chosen": -0.8973721265792847, "rewards/margins": 0.5865209698677063, "rewards/rejected": -1.4838931560516357, "step": 56 }, { "epoch": 0.43018867924528303, "grad_norm": 5.692888259887695, "learning_rate": 4.367088607594937e-06, "logps/chosen": -41.21876907348633, "logps/rejected": -53.25712585449219, "loss": 0.5296, "losses/dpo": 0.4523683488368988, "losses/sft": 1.5277466773986816, "losses/total": 0.4523683488368988, "ref_logps/chosen": -31.35172462463379, "ref_logps/rejected": -35.32482147216797, "rewards/accuracies": 0.7421875, "rewards/chosen": -0.9867046475410461, "rewards/margins": 0.806525707244873, "rewards/rejected": -1.7932302951812744, "step": 57 }, { "epoch": 0.4377358490566038, "grad_norm": 5.991930961608887, "learning_rate": 4.345991561181435e-06, "logps/chosen": -39.15966033935547, "logps/rejected": -50.0914306640625, "loss": 0.5912, "losses/dpo": 0.8516952991485596, "losses/sft": 1.4551334381103516, "losses/total": 0.8516952991485596, "ref_logps/chosen": -28.90890884399414, "ref_logps/rejected": -32.83641052246094, "rewards/accuracies": 0.6953125, "rewards/chosen": -1.0250749588012695, "rewards/margins": 0.7004267573356628, "rewards/rejected": -1.7255017757415771, "step": 58 }, { "epoch": 0.44528301886792454, "grad_norm": 5.214711666107178, "learning_rate": 4.324894514767933e-06, "logps/chosen": -40.67414474487305, "logps/rejected": -57.66020584106445, "loss": 0.4895, "losses/dpo": 0.5767215490341187, "losses/sft": 1.3480836153030396, "losses/total": 0.5767215490341187, "ref_logps/chosen": -31.317989349365234, "ref_logps/rejected": -38.934444427490234, "rewards/accuracies": 0.7578125, "rewards/chosen": -0.9356154799461365, "rewards/margins": 0.93696129322052, "rewards/rejected": -1.8725767135620117, "step": 59 }, { "epoch": 0.4528301886792453, "grad_norm": 6.370638847351074, "learning_rate": 4.303797468354431e-06, "logps/chosen": -40.94059753417969, "logps/rejected": -51.065185546875, "loss": 0.6121, "losses/dpo": 0.4795827865600586, "losses/sft": 1.4322285652160645, "losses/total": 0.4795827865600586, "ref_logps/chosen": -29.871326446533203, "ref_logps/rejected": -34.67937469482422, "rewards/accuracies": 0.703125, "rewards/chosen": -1.1069271564483643, "rewards/margins": 0.5316535830497742, "rewards/rejected": -1.6385807991027832, "step": 60 }, { "epoch": 0.46037735849056605, "grad_norm": 6.262545585632324, "learning_rate": 4.2827004219409285e-06, "logps/chosen": -39.27662658691406, "logps/rejected": -55.63613510131836, "loss": 0.5328, "losses/dpo": 0.596257746219635, "losses/sft": 1.5779916048049927, "losses/total": 0.596257746219635, "ref_logps/chosen": -29.888713836669922, "ref_logps/rejected": -38.11284637451172, "rewards/accuracies": 0.7734375, "rewards/chosen": -0.9387915134429932, "rewards/margins": 0.8135374784469604, "rewards/rejected": -1.752328872680664, "step": 61 }, { "epoch": 0.4679245283018868, "grad_norm": 5.742541313171387, "learning_rate": 4.261603375527426e-06, "logps/chosen": -40.12831497192383, "logps/rejected": -48.92723846435547, "loss": 0.5512, "losses/dpo": 0.6246651411056519, "losses/sft": 1.4945807456970215, "losses/total": 0.6246651411056519, "ref_logps/chosen": -31.467824935913086, "ref_logps/rejected": -33.796714782714844, "rewards/accuracies": 0.7109375, "rewards/chosen": -0.86604905128479, "rewards/margins": 0.6470035314559937, "rewards/rejected": -1.5130527019500732, "step": 62 }, { "epoch": 0.47547169811320755, "grad_norm": 5.606804370880127, "learning_rate": 4.240506329113924e-06, "logps/chosen": -38.19676971435547, "logps/rejected": -51.48821258544922, "loss": 0.5277, "losses/dpo": 0.6507617235183716, "losses/sft": 1.7701728343963623, "losses/total": 0.6507617235183716, "ref_logps/chosen": -28.915279388427734, "ref_logps/rejected": -34.66787338256836, "rewards/accuracies": 0.7421875, "rewards/chosen": -0.9281493425369263, "rewards/margins": 0.7538847327232361, "rewards/rejected": -1.6820340156555176, "step": 63 }, { "epoch": 0.4830188679245283, "grad_norm": 5.0319647789001465, "learning_rate": 4.219409282700423e-06, "logps/chosen": -36.99481964111328, "logps/rejected": -49.86357116699219, "loss": 0.5064, "losses/dpo": 0.3545170724391937, "losses/sft": 1.4933311939239502, "losses/total": 0.3545170724391937, "ref_logps/chosen": -27.901212692260742, "ref_logps/rejected": -32.68770980834961, "rewards/accuracies": 0.7265625, "rewards/chosen": -0.9093605875968933, "rewards/margins": 0.8082252144813538, "rewards/rejected": -1.717585802078247, "step": 64 }, { "epoch": 0.49056603773584906, "grad_norm": 5.624882698059082, "learning_rate": 4.19831223628692e-06, "logps/chosen": -41.812522888183594, "logps/rejected": -50.7585334777832, "loss": 0.5876, "losses/dpo": 0.7280862331390381, "losses/sft": 1.730393648147583, "losses/total": 0.7280862331390381, "ref_logps/chosen": -32.30836868286133, "ref_logps/rejected": -35.13983917236328, "rewards/accuracies": 0.734375, "rewards/chosen": -0.9504156708717346, "rewards/margins": 0.6114538908004761, "rewards/rejected": -1.5618693828582764, "step": 65 }, { "epoch": 0.4981132075471698, "grad_norm": 5.693253517150879, "learning_rate": 4.177215189873418e-06, "logps/chosen": -41.0583381652832, "logps/rejected": -57.39263153076172, "loss": 0.4943, "losses/dpo": 0.4355122745037079, "losses/sft": 1.1784332990646362, "losses/total": 0.4355122745037079, "ref_logps/chosen": -31.086488723754883, "ref_logps/rejected": -38.95460891723633, "rewards/accuracies": 0.7734375, "rewards/chosen": -0.9971848726272583, "rewards/margins": 0.8466169834136963, "rewards/rejected": -1.8438018560409546, "step": 66 }, { "epoch": 0.5056603773584906, "grad_norm": 7.065197944641113, "learning_rate": 4.156118143459915e-06, "logps/chosen": -39.25422668457031, "logps/rejected": -50.87791442871094, "loss": 0.5339, "losses/dpo": 0.8317179083824158, "losses/sft": 1.3175883293151855, "losses/total": 0.8317179083824158, "ref_logps/chosen": -28.742538452148438, "ref_logps/rejected": -33.21300506591797, "rewards/accuracies": 0.71875, "rewards/chosen": -1.0511683225631714, "rewards/margins": 0.7153225541114807, "rewards/rejected": -1.7664909362792969, "step": 67 }, { "epoch": 0.5132075471698113, "grad_norm": 5.2859907150268555, "learning_rate": 4.135021097046414e-06, "logps/chosen": -38.8599853515625, "logps/rejected": -53.00445556640625, "loss": 0.4691, "losses/dpo": 0.5039323568344116, "losses/sft": 0.7760534286499023, "losses/total": 0.5039323568344116, "ref_logps/chosen": -30.572181701660156, "ref_logps/rejected": -35.3000602722168, "rewards/accuracies": 0.78125, "rewards/chosen": -0.828780472278595, "rewards/margins": 0.9416592121124268, "rewards/rejected": -1.770439863204956, "step": 68 }, { "epoch": 0.5207547169811321, "grad_norm": 5.869091033935547, "learning_rate": 4.113924050632912e-06, "logps/chosen": -39.38776397705078, "logps/rejected": -56.88161849975586, "loss": 0.5432, "losses/dpo": 0.28467729687690735, "losses/sft": 1.18190336227417, "losses/total": 0.28467729687690735, "ref_logps/chosen": -28.762392044067383, "ref_logps/rejected": -37.78593063354492, "rewards/accuracies": 0.7109375, "rewards/chosen": -1.0625369548797607, "rewards/margins": 0.8470318913459778, "rewards/rejected": -1.9095687866210938, "step": 69 }, { "epoch": 0.5283018867924528, "grad_norm": 5.994374752044678, "learning_rate": 4.09282700421941e-06, "logps/chosen": -39.55097579956055, "logps/rejected": -50.52848815917969, "loss": 0.5564, "losses/dpo": 0.6404341459274292, "losses/sft": 1.6006711721420288, "losses/total": 0.6404341459274292, "ref_logps/chosen": -29.427146911621094, "ref_logps/rejected": -32.81227111816406, "rewards/accuracies": 0.7109375, "rewards/chosen": -1.0123828649520874, "rewards/margins": 0.759239137172699, "rewards/rejected": -1.7716220617294312, "step": 70 }, { "epoch": 0.5358490566037736, "grad_norm": 6.391396999359131, "learning_rate": 4.0717299578059074e-06, "logps/chosen": -39.2120475769043, "logps/rejected": -53.28424072265625, "loss": 0.5533, "losses/dpo": 0.35934409499168396, "losses/sft": 1.446973443031311, "losses/total": 0.35934409499168396, "ref_logps/chosen": -28.753549575805664, "ref_logps/rejected": -34.92060470581055, "rewards/accuracies": 0.71875, "rewards/chosen": -1.0458496809005737, "rewards/margins": 0.790514349937439, "rewards/rejected": -1.8363640308380127, "step": 71 }, { "epoch": 0.5433962264150943, "grad_norm": 6.23837423324585, "learning_rate": 4.050632911392405e-06, "logps/chosen": -41.26081466674805, "logps/rejected": -57.397159576416016, "loss": 0.5566, "losses/dpo": 0.6022348403930664, "losses/sft": 1.4900131225585938, "losses/total": 0.6022348403930664, "ref_logps/chosen": -29.823455810546875, "ref_logps/rejected": -38.314422607421875, "rewards/accuracies": 0.6953125, "rewards/chosen": -1.1437358856201172, "rewards/margins": 0.7645378112792969, "rewards/rejected": -1.908273696899414, "step": 72 }, { "epoch": 0.5509433962264151, "grad_norm": 5.54686164855957, "learning_rate": 4.029535864978903e-06, "logps/chosen": -40.462684631347656, "logps/rejected": -56.720314025878906, "loss": 0.4834, "losses/dpo": 0.5470014810562134, "losses/sft": 1.4223295450210571, "losses/total": 0.5470014810562134, "ref_logps/chosen": -30.118234634399414, "ref_logps/rejected": -36.64280700683594, "rewards/accuracies": 0.75, "rewards/chosen": -1.0344449281692505, "rewards/margins": 0.9733060002326965, "rewards/rejected": -2.007750988006592, "step": 73 }, { "epoch": 0.5584905660377358, "grad_norm": 6.6403656005859375, "learning_rate": 4.008438818565401e-06, "logps/chosen": -43.17585754394531, "logps/rejected": -54.99962615966797, "loss": 0.6214, "losses/dpo": 0.8616761565208435, "losses/sft": 1.4494423866271973, "losses/total": 0.8616761565208435, "ref_logps/chosen": -30.735164642333984, "ref_logps/rejected": -36.314048767089844, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2440688610076904, "rewards/margins": 0.6244890689849854, "rewards/rejected": -1.8685579299926758, "step": 74 }, { "epoch": 0.5660377358490566, "grad_norm": 5.264912128448486, "learning_rate": 3.9873417721518995e-06, "logps/chosen": -40.38694763183594, "logps/rejected": -57.54372024536133, "loss": 0.4474, "losses/dpo": 0.49264034628868103, "losses/sft": 1.4342858791351318, "losses/total": 0.49264034628868103, "ref_logps/chosen": -30.248275756835938, "ref_logps/rejected": -35.96062469482422, "rewards/accuracies": 0.8046875, "rewards/chosen": -1.0138667821884155, "rewards/margins": 1.1444426774978638, "rewards/rejected": -2.1583094596862793, "step": 75 }, { "epoch": 0.5735849056603773, "grad_norm": 5.621297836303711, "learning_rate": 3.9662447257383965e-06, "logps/chosen": -40.704254150390625, "logps/rejected": -54.67380905151367, "loss": 0.5208, "losses/dpo": 0.3526512086391449, "losses/sft": 1.205520749092102, "losses/total": 0.3526512086391449, "ref_logps/chosen": -30.080062866210938, "ref_logps/rejected": -35.63865661621094, "rewards/accuracies": 0.7265625, "rewards/chosen": -1.0624192953109741, "rewards/margins": 0.8410958647727966, "rewards/rejected": -1.903515100479126, "step": 76 }, { "epoch": 0.5811320754716981, "grad_norm": 5.440826416015625, "learning_rate": 3.945147679324895e-06, "logps/chosen": -38.52376174926758, "logps/rejected": -51.74415588378906, "loss": 0.5272, "losses/dpo": 0.43253982067108154, "losses/sft": 1.3709319829940796, "losses/total": 0.43253982067108154, "ref_logps/chosen": -29.032743453979492, "ref_logps/rejected": -34.911163330078125, "rewards/accuracies": 0.75, "rewards/chosen": -0.9491016268730164, "rewards/margins": 0.7341974377632141, "rewards/rejected": -1.68329918384552, "step": 77 }, { "epoch": 0.5886792452830188, "grad_norm": 4.8831377029418945, "learning_rate": 3.924050632911393e-06, "logps/chosen": -37.76980972290039, "logps/rejected": -57.21007537841797, "loss": 0.4277, "losses/dpo": 0.4043513536453247, "losses/sft": 1.341131329536438, "losses/total": 0.4043513536453247, "ref_logps/chosen": -29.5567684173584, "ref_logps/rejected": -37.932891845703125, "rewards/accuracies": 0.828125, "rewards/chosen": -0.8213039636611938, "rewards/margins": 1.1064141988754272, "rewards/rejected": -1.927718162536621, "step": 78 }, { "epoch": 0.5962264150943396, "grad_norm": 4.781276702880859, "learning_rate": 3.902953586497891e-06, "logps/chosen": -39.535728454589844, "logps/rejected": -55.912841796875, "loss": 0.4133, "losses/dpo": 0.22891421616077423, "losses/sft": 1.0599099397659302, "losses/total": 0.22891421616077423, "ref_logps/chosen": -30.658559799194336, "ref_logps/rejected": -35.42290115356445, "rewards/accuracies": 0.8359375, "rewards/chosen": -0.8877166509628296, "rewards/margins": 1.1612776517868042, "rewards/rejected": -2.048994302749634, "step": 79 }, { "epoch": 0.6037735849056604, "grad_norm": 5.942996501922607, "learning_rate": 3.8818565400843886e-06, "logps/chosen": -42.60219192504883, "logps/rejected": -59.31925582885742, "loss": 0.4876, "losses/dpo": 0.18772652745246887, "losses/sft": 1.6294547319412231, "losses/total": 0.18772652745246887, "ref_logps/chosen": -31.457292556762695, "ref_logps/rejected": -36.81336212158203, "rewards/accuracies": 0.7578125, "rewards/chosen": -1.114490032196045, "rewards/margins": 1.1360994577407837, "rewards/rejected": -2.250589370727539, "step": 80 }, { "epoch": 0.6113207547169811, "grad_norm": 6.1673102378845215, "learning_rate": 3.860759493670886e-06, "logps/chosen": -45.821449279785156, "logps/rejected": -57.79277420043945, "loss": 0.5474, "losses/dpo": 0.5481749773025513, "losses/sft": 1.9008748531341553, "losses/total": 0.5481749773025513, "ref_logps/chosen": -33.29500961303711, "ref_logps/rejected": -35.532379150390625, "rewards/accuracies": 0.734375, "rewards/chosen": -1.2526437044143677, "rewards/margins": 0.9733958840370178, "rewards/rejected": -2.2260396480560303, "step": 81 }, { "epoch": 0.6188679245283019, "grad_norm": 6.780162811279297, "learning_rate": 3.839662447257384e-06, "logps/chosen": -42.07637405395508, "logps/rejected": -53.19427490234375, "loss": 0.6369, "losses/dpo": 0.47502392530441284, "losses/sft": 1.4819515943527222, "losses/total": 0.47502392530441284, "ref_logps/chosen": -30.144392013549805, "ref_logps/rejected": -34.309967041015625, "rewards/accuracies": 0.6640625, "rewards/chosen": -1.193198323249817, "rewards/margins": 0.695232629776001, "rewards/rejected": -1.8884310722351074, "step": 82 }, { "epoch": 0.6264150943396226, "grad_norm": 6.206979751586914, "learning_rate": 3.818565400843882e-06, "logps/chosen": -41.30870056152344, "logps/rejected": -53.384010314941406, "loss": 0.5578, "losses/dpo": 0.7641366720199585, "losses/sft": 1.1807348728179932, "losses/total": 0.7641366720199585, "ref_logps/chosen": -30.49885368347168, "ref_logps/rejected": -34.54371643066406, "rewards/accuracies": 0.7421875, "rewards/chosen": -1.0809844732284546, "rewards/margins": 0.803044319152832, "rewards/rejected": -1.8840289115905762, "step": 83 }, { "epoch": 0.6339622641509434, "grad_norm": 6.179627418518066, "learning_rate": 3.7974683544303802e-06, "logps/chosen": -43.51046371459961, "logps/rejected": -57.6606330871582, "loss": 0.5393, "losses/dpo": 0.2461186796426773, "losses/sft": 1.4413666725158691, "losses/total": 0.2461186796426773, "ref_logps/chosen": -31.447532653808594, "ref_logps/rejected": -35.95327377319336, "rewards/accuracies": 0.71875, "rewards/chosen": -1.2062931060791016, "rewards/margins": 0.964443027973175, "rewards/rejected": -2.170736312866211, "step": 84 }, { "epoch": 0.6415094339622641, "grad_norm": 5.617598533630371, "learning_rate": 3.776371308016878e-06, "logps/chosen": -40.44061279296875, "logps/rejected": -58.36695098876953, "loss": 0.4637, "losses/dpo": 0.32039570808410645, "losses/sft": 1.308812141418457, "losses/total": 0.32039570808410645, "ref_logps/chosen": -30.45447540283203, "ref_logps/rejected": -36.56258010864258, "rewards/accuracies": 0.8203125, "rewards/chosen": -0.9986135363578796, "rewards/margins": 1.1818233728408813, "rewards/rejected": -2.180436849594116, "step": 85 }, { "epoch": 0.6490566037735849, "grad_norm": 6.853886604309082, "learning_rate": 3.755274261603376e-06, "logps/chosen": -42.52161407470703, "logps/rejected": -54.287841796875, "loss": 0.5991, "losses/dpo": 0.3581075370311737, "losses/sft": 1.5368932485580444, "losses/total": 0.3581075370311737, "ref_logps/chosen": -31.34189796447754, "ref_logps/rejected": -35.659629821777344, "rewards/accuracies": 0.671875, "rewards/chosen": -1.117971658706665, "rewards/margins": 0.7448497414588928, "rewards/rejected": -1.8628215789794922, "step": 86 }, { "epoch": 0.6566037735849056, "grad_norm": 6.692890167236328, "learning_rate": 3.7341772151898737e-06, "logps/chosen": -43.72019958496094, "logps/rejected": -58.138694763183594, "loss": 0.5519, "losses/dpo": 0.48806032538414, "losses/sft": 0.9289418458938599, "losses/total": 0.48806032538414, "ref_logps/chosen": -31.137048721313477, "ref_logps/rejected": -36.59523010253906, "rewards/accuracies": 0.75, "rewards/chosen": -1.2583153247833252, "rewards/margins": 0.8960307836532593, "rewards/rejected": -2.154345989227295, "step": 87 }, { "epoch": 0.6641509433962264, "grad_norm": 5.944717884063721, "learning_rate": 3.713080168776372e-06, "logps/chosen": -43.606327056884766, "logps/rejected": -54.38173294067383, "loss": 0.525, "losses/dpo": 0.4686363637447357, "losses/sft": 1.4131765365600586, "losses/total": 0.4686363637447357, "ref_logps/chosen": -32.12074279785156, "ref_logps/rejected": -35.17863464355469, "rewards/accuracies": 0.7421875, "rewards/chosen": -1.1485581398010254, "rewards/margins": 0.7717516422271729, "rewards/rejected": -1.9203099012374878, "step": 88 }, { "epoch": 0.6716981132075471, "grad_norm": 5.189677715301514, "learning_rate": 3.6919831223628693e-06, "logps/chosen": -41.43803787231445, "logps/rejected": -58.55611038208008, "loss": 0.4709, "losses/dpo": 0.27158284187316895, "losses/sft": 1.2001585960388184, "losses/total": 0.27158284187316895, "ref_logps/chosen": -30.10678482055664, "ref_logps/rejected": -36.89292907714844, "rewards/accuracies": 0.7578125, "rewards/chosen": -1.1331257820129395, "rewards/margins": 1.0331923961639404, "rewards/rejected": -2.16631817817688, "step": 89 }, { "epoch": 0.6792452830188679, "grad_norm": 5.98684024810791, "learning_rate": 3.6708860759493675e-06, "logps/chosen": -43.76606750488281, "logps/rejected": -52.34052276611328, "loss": 0.5686, "losses/dpo": 0.7345205545425415, "losses/sft": 1.5648329257965088, "losses/total": 0.7345205545425415, "ref_logps/chosen": -31.27000617980957, "ref_logps/rejected": -32.98979187011719, "rewards/accuracies": 0.71875, "rewards/chosen": -1.2496063709259033, "rewards/margins": 0.6854668855667114, "rewards/rejected": -1.9350732564926147, "step": 90 }, { "epoch": 0.6867924528301886, "grad_norm": 5.93943452835083, "learning_rate": 3.649789029535865e-06, "logps/chosen": -44.927276611328125, "logps/rejected": -56.36608123779297, "loss": 0.5478, "losses/dpo": 0.31034794449806213, "losses/sft": 1.5676113367080688, "losses/total": 0.31034794449806213, "ref_logps/chosen": -31.989208221435547, "ref_logps/rejected": -35.02646255493164, "rewards/accuracies": 0.75, "rewards/chosen": -1.2938066720962524, "rewards/margins": 0.8401551246643066, "rewards/rejected": -2.1339616775512695, "step": 91 }, { "epoch": 0.6943396226415094, "grad_norm": 5.51214075088501, "learning_rate": 3.628691983122363e-06, "logps/chosen": -44.41264343261719, "logps/rejected": -60.244510650634766, "loss": 0.4797, "losses/dpo": 0.42363566160202026, "losses/sft": 1.270397424697876, "losses/total": 0.42363566160202026, "ref_logps/chosen": -33.250125885009766, "ref_logps/rejected": -40.045772552490234, "rewards/accuracies": 0.75, "rewards/chosen": -1.1162512302398682, "rewards/margins": 0.9036226868629456, "rewards/rejected": -2.019874095916748, "step": 92 }, { "epoch": 0.7018867924528301, "grad_norm": 5.8951945304870605, "learning_rate": 3.607594936708861e-06, "logps/chosen": -41.33027648925781, "logps/rejected": -54.42854309082031, "loss": 0.5894, "losses/dpo": 0.31869563460350037, "losses/sft": 1.346259593963623, "losses/total": 0.31869563460350037, "ref_logps/chosen": -28.775463104248047, "ref_logps/rejected": -35.096275329589844, "rewards/accuracies": 0.703125, "rewards/chosen": -1.255481243133545, "rewards/margins": 0.6777457594871521, "rewards/rejected": -1.9332269430160522, "step": 93 }, { "epoch": 0.7094339622641509, "grad_norm": 6.491157531738281, "learning_rate": 3.586497890295359e-06, "logps/chosen": -43.51985168457031, "logps/rejected": -54.04357147216797, "loss": 0.5492, "losses/dpo": 0.6047073602676392, "losses/sft": 1.633582353591919, "losses/total": 0.6047073602676392, "ref_logps/chosen": -32.43757247924805, "ref_logps/rejected": -35.235679626464844, "rewards/accuracies": 0.6640625, "rewards/chosen": -1.1082279682159424, "rewards/margins": 0.7725614309310913, "rewards/rejected": -1.8807893991470337, "step": 94 }, { "epoch": 0.7169811320754716, "grad_norm": 5.706908702850342, "learning_rate": 3.5654008438818566e-06, "logps/chosen": -44.01639938354492, "logps/rejected": -56.538856506347656, "loss": 0.5286, "losses/dpo": 1.006984829902649, "losses/sft": 1.5528416633605957, "losses/total": 1.006984829902649, "ref_logps/chosen": -31.790197372436523, "ref_logps/rejected": -36.23029327392578, "rewards/accuracies": 0.7578125, "rewards/chosen": -1.2226204872131348, "rewards/margins": 0.8082359433174133, "rewards/rejected": -2.0308563709259033, "step": 95 }, { "epoch": 0.7245283018867924, "grad_norm": 5.353209018707275, "learning_rate": 3.544303797468355e-06, "logps/chosen": -39.303466796875, "logps/rejected": -55.11781311035156, "loss": 0.489, "losses/dpo": 0.3290543556213379, "losses/sft": 1.5605956315994263, "losses/total": 0.3290543556213379, "ref_logps/chosen": -30.248308181762695, "ref_logps/rejected": -37.410919189453125, "rewards/accuracies": 0.765625, "rewards/chosen": -0.9055157899856567, "rewards/margins": 0.8651739358901978, "rewards/rejected": -1.770689606666565, "step": 96 }, { "epoch": 0.7320754716981132, "grad_norm": 5.129034996032715, "learning_rate": 3.523206751054853e-06, "logps/chosen": -39.52154541015625, "logps/rejected": -57.900733947753906, "loss": 0.4432, "losses/dpo": 0.5874547958374023, "losses/sft": 2.055988073348999, "losses/total": 0.5874547958374023, "ref_logps/chosen": -30.57717514038086, "ref_logps/rejected": -37.92079544067383, "rewards/accuracies": 0.828125, "rewards/chosen": -0.8944366574287415, "rewards/margins": 1.1035571098327637, "rewards/rejected": -1.99799382686615, "step": 97 }, { "epoch": 0.7396226415094339, "grad_norm": 5.658941745758057, "learning_rate": 3.5021097046413504e-06, "logps/chosen": -42.83544158935547, "logps/rejected": -51.84044647216797, "loss": 0.5452, "losses/dpo": 0.711673378944397, "losses/sft": 1.195967674255371, "losses/total": 0.711673378944397, "ref_logps/chosen": -32.588661193847656, "ref_logps/rejected": -34.962364196777344, "rewards/accuracies": 0.7734375, "rewards/chosen": -1.0246777534484863, "rewards/margins": 0.6631301641464233, "rewards/rejected": -1.6878077983856201, "step": 98 }, { "epoch": 0.7471698113207547, "grad_norm": 5.521385669708252, "learning_rate": 3.4810126582278487e-06, "logps/chosen": -38.521541595458984, "logps/rejected": -50.79652786254883, "loss": 0.5304, "losses/dpo": 0.3267762064933777, "losses/sft": 1.4650697708129883, "losses/total": 0.3267762064933777, "ref_logps/chosen": -28.887462615966797, "ref_logps/rejected": -33.21889877319336, "rewards/accuracies": 0.71875, "rewards/chosen": -0.9634078741073608, "rewards/margins": 0.7943546772003174, "rewards/rejected": -1.7577626705169678, "step": 99 }, { "epoch": 0.7547169811320755, "grad_norm": 6.092041492462158, "learning_rate": 3.459915611814346e-06, "logps/chosen": -43.435054779052734, "logps/rejected": -48.984378814697266, "loss": 0.6331, "losses/dpo": 0.2698668837547302, "losses/sft": 1.1696505546569824, "losses/total": 0.2698668837547302, "ref_logps/chosen": -31.87053871154785, "ref_logps/rejected": -31.63926887512207, "rewards/accuracies": 0.6953125, "rewards/chosen": -1.1564514636993408, "rewards/margins": 0.5780597925186157, "rewards/rejected": -1.734511375427246, "step": 100 }, { "epoch": 0.7622641509433963, "grad_norm": 5.298862934112549, "learning_rate": 3.4388185654008443e-06, "logps/chosen": -38.33399200439453, "logps/rejected": -53.81504821777344, "loss": 0.478, "losses/dpo": 0.4309498071670532, "losses/sft": 1.7226063013076782, "losses/total": 0.4309498071670532, "ref_logps/chosen": -29.21530532836914, "ref_logps/rejected": -36.157432556152344, "rewards/accuracies": 0.7890625, "rewards/chosen": -0.9118687510490417, "rewards/margins": 0.8538926839828491, "rewards/rejected": -1.765761375427246, "step": 101 }, { "epoch": 0.769811320754717, "grad_norm": 5.454709529876709, "learning_rate": 3.417721518987342e-06, "logps/chosen": -36.00552749633789, "logps/rejected": -47.35252380371094, "loss": 0.5585, "losses/dpo": 0.3503888249397278, "losses/sft": 1.201081395149231, "losses/total": 0.3503888249397278, "ref_logps/chosen": -27.280101776123047, "ref_logps/rejected": -31.31577491760254, "rewards/accuracies": 0.75, "rewards/chosen": -0.8725426197052002, "rewards/margins": 0.7311323881149292, "rewards/rejected": -1.6036750078201294, "step": 102 }, { "epoch": 0.7773584905660378, "grad_norm": 5.91898250579834, "learning_rate": 3.39662447257384e-06, "logps/chosen": -41.72618103027344, "logps/rejected": -53.16327667236328, "loss": 0.5765, "losses/dpo": 0.3589654564857483, "losses/sft": 1.56475031375885, "losses/total": 0.3589654564857483, "ref_logps/chosen": -30.899045944213867, "ref_logps/rejected": -34.665802001953125, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0827136039733887, "rewards/margins": 0.7670339941978455, "rewards/rejected": -1.8497475385665894, "step": 103 }, { "epoch": 0.7849056603773585, "grad_norm": 5.455533981323242, "learning_rate": 3.3755274261603377e-06, "logps/chosen": -41.35075378417969, "logps/rejected": -52.81592559814453, "loss": 0.5401, "losses/dpo": 0.39715278148651123, "losses/sft": 1.4784698486328125, "losses/total": 0.39715278148651123, "ref_logps/chosen": -30.155914306640625, "ref_logps/rejected": -34.45831298828125, "rewards/accuracies": 0.734375, "rewards/chosen": -1.1194841861724854, "rewards/margins": 0.7162774801254272, "rewards/rejected": -1.835761547088623, "step": 104 }, { "epoch": 0.7924528301886793, "grad_norm": 4.831524848937988, "learning_rate": 3.354430379746836e-06, "logps/chosen": -38.7393684387207, "logps/rejected": -53.984886169433594, "loss": 0.4876, "losses/dpo": 0.5145635604858398, "losses/sft": 1.3864595890045166, "losses/total": 0.5145635604858398, "ref_logps/chosen": -27.76490020751953, "ref_logps/rejected": -34.16275405883789, "rewards/accuracies": 0.7109375, "rewards/chosen": -1.0974467992782593, "rewards/margins": 0.8847663402557373, "rewards/rejected": -1.9822131395339966, "step": 105 }, { "epoch": 0.8, "grad_norm": 4.85020637512207, "learning_rate": 3.3333333333333333e-06, "logps/chosen": -34.77644729614258, "logps/rejected": -51.18222427368164, "loss": 0.5472, "losses/dpo": 0.6919156908988953, "losses/sft": 1.51556396484375, "losses/total": 0.6919156908988953, "ref_logps/chosen": -24.479114532470703, "ref_logps/rejected": -33.03002166748047, "rewards/accuracies": 0.703125, "rewards/chosen": -1.0297331809997559, "rewards/margins": 0.785487174987793, "rewards/rejected": -1.8152204751968384, "step": 106 }, { "epoch": 0.8075471698113208, "grad_norm": 5.955616474151611, "learning_rate": 3.3122362869198316e-06, "logps/chosen": -43.87641143798828, "logps/rejected": -54.51424789428711, "loss": 0.6114, "losses/dpo": 0.756654679775238, "losses/sft": 1.6842637062072754, "losses/total": 0.756654679775238, "ref_logps/chosen": -31.69654083251953, "ref_logps/rejected": -36.67472839355469, "rewards/accuracies": 0.734375, "rewards/chosen": -1.217987060546875, "rewards/margins": 0.565964937210083, "rewards/rejected": -1.7839521169662476, "step": 107 }, { "epoch": 0.8150943396226416, "grad_norm": 5.920444965362549, "learning_rate": 3.2911392405063294e-06, "logps/chosen": -40.982208251953125, "logps/rejected": -52.83149337768555, "loss": 0.5517, "losses/dpo": 0.65492182970047, "losses/sft": 1.6355092525482178, "losses/total": 0.65492182970047, "ref_logps/chosen": -29.412181854248047, "ref_logps/rejected": -34.126094818115234, "rewards/accuracies": 0.6796875, "rewards/chosen": -1.157002329826355, "rewards/margins": 0.7135379910469055, "rewards/rejected": -1.8705402612686157, "step": 108 }, { "epoch": 0.8226415094339623, "grad_norm": 5.697230339050293, "learning_rate": 3.270042194092827e-06, "logps/chosen": -42.04249954223633, "logps/rejected": -56.36012268066406, "loss": 0.5426, "losses/dpo": 0.3828045725822449, "losses/sft": 1.5319883823394775, "losses/total": 0.3828045725822449, "ref_logps/chosen": -31.600698471069336, "ref_logps/rejected": -36.47235107421875, "rewards/accuracies": 0.7265625, "rewards/chosen": -1.0441800355911255, "rewards/margins": 0.9445971846580505, "rewards/rejected": -1.9887771606445312, "step": 109 }, { "epoch": 0.8301886792452831, "grad_norm": 6.10832405090332, "learning_rate": 3.248945147679325e-06, "logps/chosen": -39.39381408691406, "logps/rejected": -50.33586120605469, "loss": 0.6518, "losses/dpo": 0.5053750872612, "losses/sft": 1.3503327369689941, "losses/total": 0.5053750872612, "ref_logps/chosen": -27.708271026611328, "ref_logps/rejected": -33.47583770751953, "rewards/accuracies": 0.640625, "rewards/chosen": -1.1685543060302734, "rewards/margins": 0.5174481272697449, "rewards/rejected": -1.6860023736953735, "step": 110 }, { "epoch": 0.8377358490566038, "grad_norm": 5.662174224853516, "learning_rate": 3.2278481012658232e-06, "logps/chosen": -41.917667388916016, "logps/rejected": -55.30170440673828, "loss": 0.5537, "losses/dpo": 0.9183385372161865, "losses/sft": 1.6308963298797607, "losses/total": 0.9183385372161865, "ref_logps/chosen": -29.052471160888672, "ref_logps/rejected": -34.99309539794922, "rewards/accuracies": 0.7109375, "rewards/chosen": -1.2865197658538818, "rewards/margins": 0.7443405985832214, "rewards/rejected": -2.030860424041748, "step": 111 }, { "epoch": 0.8452830188679246, "grad_norm": 5.386070728302002, "learning_rate": 3.206751054852321e-06, "logps/chosen": -43.091156005859375, "logps/rejected": -57.210243225097656, "loss": 0.5028, "losses/dpo": 0.3924194872379303, "losses/sft": 1.2852116823196411, "losses/total": 0.3924194872379303, "ref_logps/chosen": -30.833717346191406, "ref_logps/rejected": -36.638153076171875, "rewards/accuracies": 0.78125, "rewards/chosen": -1.2257441282272339, "rewards/margins": 0.8314655423164368, "rewards/rejected": -2.0572097301483154, "step": 112 }, { "epoch": 0.8528301886792453, "grad_norm": 6.078062057495117, "learning_rate": 3.185654008438819e-06, "logps/chosen": -44.7150764465332, "logps/rejected": -60.601261138916016, "loss": 0.5275, "losses/dpo": 0.3586404323577881, "losses/sft": 1.5925896167755127, "losses/total": 0.3586404323577881, "ref_logps/chosen": -31.377941131591797, "ref_logps/rejected": -38.768470764160156, "rewards/accuracies": 0.7265625, "rewards/chosen": -1.3337135314941406, "rewards/margins": 0.8495657444000244, "rewards/rejected": -2.183279037475586, "step": 113 }, { "epoch": 0.8603773584905661, "grad_norm": 5.526185035705566, "learning_rate": 3.164556962025317e-06, "logps/chosen": -42.03477478027344, "logps/rejected": -54.82069778442383, "loss": 0.5588, "losses/dpo": 1.1230486631393433, "losses/sft": 1.6963893175125122, "losses/total": 1.1230486631393433, "ref_logps/chosen": -28.538129806518555, "ref_logps/rejected": -33.739173889160156, "rewards/accuracies": 0.71875, "rewards/chosen": -1.3496648073196411, "rewards/margins": 0.75848788022995, "rewards/rejected": -2.1081528663635254, "step": 114 }, { "epoch": 0.8679245283018868, "grad_norm": 5.497169017791748, "learning_rate": 3.1434599156118145e-06, "logps/chosen": -42.89957046508789, "logps/rejected": -57.447914123535156, "loss": 0.523, "losses/dpo": 0.7913863658905029, "losses/sft": 1.356689453125, "losses/total": 0.7913863658905029, "ref_logps/chosen": -30.25906753540039, "ref_logps/rejected": -36.00640869140625, "rewards/accuracies": 0.78125, "rewards/chosen": -1.2640503644943237, "rewards/margins": 0.8801001906394958, "rewards/rejected": -2.144150733947754, "step": 115 }, { "epoch": 0.8754716981132076, "grad_norm": 5.6127824783325195, "learning_rate": 3.1223628691983127e-06, "logps/chosen": -46.81511688232422, "logps/rejected": -56.43955993652344, "loss": 0.5401, "losses/dpo": 0.7304984927177429, "losses/sft": 1.7529406547546387, "losses/total": 0.7304984927177429, "ref_logps/chosen": -32.37680435180664, "ref_logps/rejected": -34.742347717285156, "rewards/accuracies": 0.7421875, "rewards/chosen": -1.4438307285308838, "rewards/margins": 0.7258903980255127, "rewards/rejected": -2.1697211265563965, "step": 116 }, { "epoch": 0.8830188679245283, "grad_norm": 6.019126892089844, "learning_rate": 3.10126582278481e-06, "logps/chosen": -44.45075988769531, "logps/rejected": -56.06095886230469, "loss": 0.5493, "losses/dpo": 0.5076150298118591, "losses/sft": 1.855008602142334, "losses/total": 0.5076150298118591, "ref_logps/chosen": -29.896549224853516, "ref_logps/rejected": -33.72525405883789, "rewards/accuracies": 0.65625, "rewards/chosen": -1.4554212093353271, "rewards/margins": 0.7781496644020081, "rewards/rejected": -2.2335708141326904, "step": 117 }, { "epoch": 0.8905660377358491, "grad_norm": 4.897617816925049, "learning_rate": 3.0801687763713083e-06, "logps/chosen": -44.17365264892578, "logps/rejected": -59.890846252441406, "loss": 0.4334, "losses/dpo": 0.41634368896484375, "losses/sft": 1.3092498779296875, "losses/total": 0.41634368896484375, "ref_logps/chosen": -30.777435302734375, "ref_logps/rejected": -36.164852142333984, "rewards/accuracies": 0.859375, "rewards/chosen": -1.3396217823028564, "rewards/margins": 1.0329774618148804, "rewards/rejected": -2.3725991249084473, "step": 118 }, { "epoch": 0.8981132075471698, "grad_norm": 5.326719760894775, "learning_rate": 3.059071729957806e-06, "logps/chosen": -41.951332092285156, "logps/rejected": -55.92094039916992, "loss": 0.4896, "losses/dpo": 0.36023202538490295, "losses/sft": 1.5721536874771118, "losses/total": 0.36023202538490295, "ref_logps/chosen": -29.51433563232422, "ref_logps/rejected": -34.74650573730469, "rewards/accuracies": 0.796875, "rewards/chosen": -1.2436996698379517, "rewards/margins": 0.8737441301345825, "rewards/rejected": -2.1174440383911133, "step": 119 }, { "epoch": 0.9056603773584906, "grad_norm": 5.39622688293457, "learning_rate": 3.037974683544304e-06, "logps/chosen": -42.2480354309082, "logps/rejected": -58.74956512451172, "loss": 0.51, "losses/dpo": 0.6462827920913696, "losses/sft": 1.5755528211593628, "losses/total": 0.6462827920913696, "ref_logps/chosen": -28.274633407592773, "ref_logps/rejected": -35.56834411621094, "rewards/accuracies": 0.7421875, "rewards/chosen": -1.397340178489685, "rewards/margins": 0.9207824468612671, "rewards/rejected": -2.318122625350952, "step": 120 } ], "logging_steps": 1.0, "max_steps": 264, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 40, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }