{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 1245, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 0.275390625, "learning_rate": 4e-09, "logits/chosen": -1.7618920803070068, "logits/rejected": -1.9108173847198486, "logps/chosen": -157.78750610351562, "logps/rejected": -289.7099609375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/margins_max": 0.0, "rewards/margins_min": 0.0, "rewards/margins_std": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.01, "grad_norm": 0.29296875, "learning_rate": 4e-08, "logits/chosen": -1.51752507686615, "logits/rejected": -1.070033311843872, "logps/chosen": -249.508544921875, "logps/rejected": -263.5408935546875, "loss": 0.693, "rewards/accuracies": 0.4722222089767456, "rewards/chosen": 0.0005126031464897096, "rewards/margins": 0.0002764479722827673, "rewards/margins_max": 0.0027176812291145325, "rewards/margins_min": -0.0021647855173796415, "rewards/margins_std": 0.003452425356954336, "rewards/rejected": 0.00023615510144736618, "step": 10 }, { "epoch": 0.02, "grad_norm": 0.326171875, "learning_rate": 8e-08, "logits/chosen": -1.4727632999420166, "logits/rejected": -0.8024684190750122, "logps/chosen": -246.8076629638672, "logps/rejected": -232.8890380859375, "loss": 0.6929, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.0010415544966235757, "rewards/margins": 0.0009085159981623292, "rewards/margins_max": 0.0034356764517724514, "rewards/margins_min": -0.0016186445718631148, "rewards/margins_std": 0.003573944792151451, "rewards/rejected": 0.00013303852756507695, "step": 20 }, { "epoch": 0.02, "grad_norm": 0.30859375, "learning_rate": 1.2e-07, "logits/chosen": -1.546022891998291, "logits/rejected": -0.9032249450683594, "logps/chosen": -295.8550109863281, "logps/rejected": -262.56494140625, "loss": 0.6927, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.0009819145780056715, "rewards/margins": 0.0008549405029043555, "rewards/margins_max": 0.0037618372589349747, "rewards/margins_min": -0.00205195602029562, "rewards/margins_std": 0.0041109723970294, "rewards/rejected": 0.00012697407510131598, "step": 30 }, { "epoch": 0.03, "grad_norm": 0.384765625, "learning_rate": 1.6e-07, "logits/chosen": -1.3732502460479736, "logits/rejected": -0.8910134434700012, "logps/chosen": -279.44305419921875, "logps/rejected": -260.0339050292969, "loss": 0.6929, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.0004959392827004194, "rewards/margins": 4.269594865036197e-05, "rewards/margins_max": 0.0025075911544263363, "rewards/margins_min": -0.0024221991188824177, "rewards/margins_std": 0.0034858882427215576, "rewards/rejected": 0.0004532432067207992, "step": 40 }, { "epoch": 0.04, "grad_norm": 0.318359375, "learning_rate": 2e-07, "logits/chosen": -1.3462735414505005, "logits/rejected": -0.9689818620681763, "logps/chosen": -294.2067565917969, "logps/rejected": -267.61468505859375, "loss": 0.6927, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.0006406746106222272, "rewards/margins": 4.840875135414535e-06, "rewards/margins_max": 0.002171821426600218, "rewards/margins_min": -0.002162139629945159, "rewards/margins_std": 0.00306457350961864, "rewards/rejected": 0.0006358337705023587, "step": 50 }, { "epoch": 0.05, "grad_norm": 0.2578125, "learning_rate": 2.4e-07, "logits/chosen": -1.3812487125396729, "logits/rejected": -0.920501708984375, "logps/chosen": -255.908935546875, "logps/rejected": -254.52267456054688, "loss": 0.6925, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.0007308960193768144, "rewards/margins": 0.000600042287260294, "rewards/margins_max": 0.002956463024020195, "rewards/margins_min": -0.0017563781002536416, "rewards/margins_std": 0.0033324819523841143, "rewards/rejected": 0.00013085365935694426, "step": 60 }, { "epoch": 0.06, "grad_norm": 0.29296875, "learning_rate": 2.8e-07, "logits/chosen": -1.3476909399032593, "logits/rejected": -1.0504162311553955, "logps/chosen": -215.3407440185547, "logps/rejected": -243.213623046875, "loss": 0.6926, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.001206626882776618, "rewards/margins": 0.0011660498566925526, "rewards/margins_max": 0.0037520560435950756, "rewards/margins_min": -0.0014199562137946486, "rewards/margins_std": 0.00365716521628201, "rewards/rejected": 4.0576916944701225e-05, "step": 70 }, { "epoch": 0.06, "grad_norm": 0.30078125, "learning_rate": 3.2e-07, "logits/chosen": -1.4696115255355835, "logits/rejected": -0.9919592142105103, "logps/chosen": -205.0357208251953, "logps/rejected": -192.74093627929688, "loss": 0.6918, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.0031056287698447704, "rewards/margins": 0.0026954771019518375, "rewards/margins_max": 0.005038493778556585, "rewards/margins_min": 0.00035246083280071616, "rewards/margins_std": 0.0033135253470391035, "rewards/rejected": 0.0004101515223737806, "step": 80 }, { "epoch": 0.07, "grad_norm": 0.296875, "learning_rate": 3.6e-07, "logits/chosen": -1.5339497327804565, "logits/rejected": -0.9177694320678711, "logps/chosen": -330.52783203125, "logps/rejected": -279.0771484375, "loss": 0.6915, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.004971296060830355, "rewards/margins": 0.00472742784768343, "rewards/margins_max": 0.007729808334261179, "rewards/margins_min": 0.0017250461969524622, "rewards/margins_std": 0.0042460085824131966, "rewards/rejected": 0.00024386882432736456, "step": 90 }, { "epoch": 0.08, "grad_norm": 0.25, "learning_rate": 4e-07, "logits/chosen": -1.4959418773651123, "logits/rejected": -1.0725454092025757, "logps/chosen": -245.35696411132812, "logps/rejected": -239.76806640625, "loss": 0.691, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.004789863713085651, "rewards/margins": 0.004364717286080122, "rewards/margins_max": 0.007768542971462011, "rewards/margins_min": 0.0009608917171135545, "rewards/margins_std": 0.004813736770302057, "rewards/rejected": 0.0004251461068633944, "step": 100 }, { "epoch": 0.09, "grad_norm": 0.30078125, "learning_rate": 4.3999999999999997e-07, "logits/chosen": -1.457798719406128, "logits/rejected": -1.0094877481460571, "logps/chosen": -215.407958984375, "logps/rejected": -233.8264617919922, "loss": 0.6908, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.004960652906447649, "rewards/margins": 0.004745746962726116, "rewards/margins_max": 0.009032508358359337, "rewards/margins_min": 0.00045898626558482647, "rewards/margins_std": 0.006062395870685577, "rewards/rejected": 0.00021490575454663485, "step": 110 }, { "epoch": 0.1, "grad_norm": 0.267578125, "learning_rate": 4.8e-07, "logits/chosen": -1.2908645868301392, "logits/rejected": -0.8783978223800659, "logps/chosen": -232.7576904296875, "logps/rejected": -225.4801483154297, "loss": 0.6909, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.005561050958931446, "rewards/margins": 0.004983147140592337, "rewards/margins_max": 0.009352527558803558, "rewards/margins_min": 0.0006137675372883677, "rewards/margins_std": 0.006179235875606537, "rewards/rejected": 0.0005779037601314485, "step": 120 }, { "epoch": 0.1, "grad_norm": 0.33984375, "learning_rate": 4.999754129398937e-07, "logits/chosen": -1.5997236967086792, "logits/rejected": -1.0487782955169678, "logps/chosen": -266.701904296875, "logps/rejected": -220.42861938476562, "loss": 0.6901, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.00810263305902481, "rewards/margins": 0.007168160285800695, "rewards/margins_max": 0.01202232763171196, "rewards/margins_min": 0.0023139934055507183, "rewards/margins_std": 0.006864829454571009, "rewards/rejected": 0.0009344721329398453, "step": 130 }, { "epoch": 0.11, "grad_norm": 0.259765625, "learning_rate": 4.997787454752217e-07, "logits/chosen": -1.4701694250106812, "logits/rejected": -0.9617838859558105, "logps/chosen": -246.3208465576172, "logps/rejected": -189.7033233642578, "loss": 0.6894, "rewards/accuracies": 0.75, "rewards/chosen": 0.00780236953869462, "rewards/margins": 0.00573467742651701, "rewards/margins_max": 0.010288970544934273, "rewards/margins_min": 0.001180383493192494, "rewards/margins_std": 0.006440743803977966, "rewards/rejected": 0.0020676928106695414, "step": 140 }, { "epoch": 0.12, "grad_norm": 0.36328125, "learning_rate": 4.993855652734615e-07, "logits/chosen": -1.458670973777771, "logits/rejected": -0.9349568486213684, "logps/chosen": -236.487060546875, "logps/rejected": -212.31546020507812, "loss": 0.6897, "rewards/accuracies": 0.875, "rewards/chosen": 0.009739547967910767, "rewards/margins": 0.0075815594755113125, "rewards/margins_max": 0.011967300437390804, "rewards/margins_min": 0.0031958178151398897, "rewards/margins_std": 0.006202374584972858, "rewards/rejected": 0.002157988492399454, "step": 150 }, { "epoch": 0.13, "grad_norm": 0.326171875, "learning_rate": 4.987961816680492e-07, "logits/chosen": -1.2137949466705322, "logits/rejected": -0.9049445986747742, "logps/chosen": -187.77523803710938, "logps/rejected": -187.45571899414062, "loss": 0.6886, "rewards/accuracies": 0.875, "rewards/chosen": 0.008446315303444862, "rewards/margins": 0.008639861829578876, "rewards/margins_max": 0.014706149697303772, "rewards/margins_min": 0.002573573961853981, "rewards/margins_std": 0.008579026907682419, "rewards/rejected": -0.00019354629330337048, "step": 160 }, { "epoch": 0.14, "grad_norm": 0.294921875, "learning_rate": 4.980110583549062e-07, "logits/chosen": -1.4705772399902344, "logits/rejected": -0.8809791803359985, "logps/chosen": -255.54696655273438, "logps/rejected": -227.39797973632812, "loss": 0.6871, "rewards/accuracies": 0.875, "rewards/chosen": 0.01453929953277111, "rewards/margins": 0.013537155464291573, "rewards/margins_max": 0.02139822207391262, "rewards/margins_min": 0.005676089785993099, "rewards/margins_std": 0.011117227375507355, "rewards/rejected": 0.0010021438356488943, "step": 170 }, { "epoch": 0.14, "grad_norm": 0.2890625, "learning_rate": 4.970308130276272e-07, "logits/chosen": -1.4390299320220947, "logits/rejected": -0.9981800317764282, "logps/chosen": -235.2798309326172, "logps/rejected": -246.27188110351562, "loss": 0.6872, "rewards/accuracies": 0.875, "rewards/chosen": 0.0153080178424716, "rewards/margins": 0.013791908510029316, "rewards/margins_max": 0.022898811846971512, "rewards/margins_min": 0.004685004707425833, "rewards/margins_std": 0.012879105284810066, "rewards/rejected": 0.0015161095652729273, "step": 180 }, { "epoch": 0.15, "grad_norm": 0.353515625, "learning_rate": 4.958562168915121e-07, "logits/chosen": -1.5204397439956665, "logits/rejected": -0.9391329884529114, "logps/chosen": -322.778076171875, "logps/rejected": -321.51885986328125, "loss": 0.6859, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.018421044573187828, "rewards/margins": 0.01598033681511879, "rewards/margins_max": 0.024204885587096214, "rewards/margins_min": 0.007755786180496216, "rewards/margins_std": 0.011631269007921219, "rewards/rejected": 0.0024407082237303257, "step": 190 }, { "epoch": 0.16, "grad_norm": 0.30859375, "learning_rate": 4.944881940568219e-07, "logits/chosen": -1.4054844379425049, "logits/rejected": -0.8457719683647156, "logps/chosen": -260.04595947265625, "logps/rejected": -212.5659942626953, "loss": 0.6872, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.016095256432890892, "rewards/margins": 0.011531388387084007, "rewards/margins_max": 0.018262848258018494, "rewards/margins_min": 0.0047999280504882336, "rewards/margins_std": 0.009519720450043678, "rewards/rejected": 0.004563868977129459, "step": 200 }, { "epoch": 0.17, "grad_norm": 0.296875, "learning_rate": 4.929278208117377e-07, "logits/chosen": -1.3484306335449219, "logits/rejected": -0.7912663817405701, "logps/chosen": -272.6092529296875, "logps/rejected": -242.0840301513672, "loss": 0.6843, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.017031192779541016, "rewards/margins": 0.014723042026162148, "rewards/margins_max": 0.024344168603420258, "rewards/margins_min": 0.005101913120597601, "rewards/margins_std": 0.013606329448521137, "rewards/rejected": 0.0023081512190401554, "step": 210 }, { "epoch": 0.18, "grad_norm": 0.318359375, "learning_rate": 4.911763247755939e-07, "logits/chosen": -1.4486370086669922, "logits/rejected": -1.0804178714752197, "logps/chosen": -222.94155883789062, "logps/rejected": -206.1028289794922, "loss": 0.6854, "rewards/accuracies": 0.875, "rewards/chosen": 0.0198357030749321, "rewards/margins": 0.014332096092402935, "rewards/margins_max": 0.022534403949975967, "rewards/margins_min": 0.006129787303507328, "rewards/margins_std": 0.01159981545060873, "rewards/rejected": 0.005503608379513025, "step": 220 }, { "epoch": 0.18, "grad_norm": 0.36328125, "learning_rate": 4.892350839330522e-07, "logits/chosen": -1.414548635482788, "logits/rejected": -0.9870842695236206, "logps/chosen": -278.77398681640625, "logps/rejected": -285.62225341796875, "loss": 0.6842, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.017949718981981277, "rewards/margins": 0.016461949795484543, "rewards/margins_max": 0.026643777266144753, "rewards/margins_min": 0.0062801227904856205, "rewards/margins_std": 0.014399275183677673, "rewards/rejected": 0.0014877680223435163, "step": 230 }, { "epoch": 0.19, "grad_norm": 0.380859375, "learning_rate": 4.871056255499757e-07, "logits/chosen": -1.5397964715957642, "logits/rejected": -1.0166374444961548, "logps/chosen": -277.46807861328125, "logps/rejected": -256.5303955078125, "loss": 0.683, "rewards/accuracies": 0.875, "rewards/chosen": 0.024707380682229996, "rewards/margins": 0.02101754955947399, "rewards/margins_max": 0.03346977010369301, "rewards/margins_min": 0.008565334603190422, "rewards/margins_std": 0.01761009357869625, "rewards/rejected": 0.003689829260110855, "step": 240 }, { "epoch": 0.2, "grad_norm": 0.26171875, "learning_rate": 4.84789624971857e-07, "logits/chosen": -1.2661800384521484, "logits/rejected": -0.9563673138618469, "logps/chosen": -218.5209197998047, "logps/rejected": -195.1947021484375, "loss": 0.6835, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.02002892456948757, "rewards/margins": 0.019926266744732857, "rewards/margins_max": 0.033761389553546906, "rewards/margins_min": 0.006091143935918808, "rewards/margins_std": 0.019565818831324577, "rewards/rejected": 0.00010265726450597867, "step": 250 }, { "epoch": 0.21, "grad_norm": 0.37890625, "learning_rate": 4.822889043057445e-07, "logits/chosen": -1.4784233570098877, "logits/rejected": -1.0394175052642822, "logps/chosen": -224.0451202392578, "logps/rejected": -250.0937957763672, "loss": 0.6828, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.02225142903625965, "rewards/margins": 0.02364351972937584, "rewards/margins_max": 0.03730713203549385, "rewards/margins_min": 0.009979905560612679, "rewards/margins_std": 0.01932326704263687, "rewards/rejected": -0.001392088714055717, "step": 260 }, { "epoch": 0.22, "grad_norm": 0.25, "learning_rate": 4.796054309867052e-07, "logits/chosen": -1.466679334640503, "logits/rejected": -1.0491148233413696, "logps/chosen": -252.1529998779297, "logps/rejected": -196.3572540283203, "loss": 0.6835, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.02002880536019802, "rewards/margins": 0.01566789671778679, "rewards/margins_max": 0.024082649499177933, "rewards/margins_min": 0.007253146730363369, "rewards/margins_std": 0.01190025545656681, "rewards/rejected": 0.004360905848443508, "step": 270 }, { "epoch": 0.22, "grad_norm": 0.310546875, "learning_rate": 4.7674131622995004e-07, "logits/chosen": -1.4919434785842896, "logits/rejected": -1.1262288093566895, "logps/chosen": -229.0741729736328, "logps/rejected": -239.2893524169922, "loss": 0.6825, "rewards/accuracies": 0.875, "rewards/chosen": 0.021708309650421143, "rewards/margins": 0.0214251521974802, "rewards/margins_max": 0.03245983272790909, "rewards/margins_min": 0.010390473529696465, "rewards/margins_std": 0.01560539286583662, "rewards/rejected": 0.00028315745294094086, "step": 280 }, { "epoch": 0.23, "grad_norm": 0.259765625, "learning_rate": 4.736988133698415e-07, "logits/chosen": -1.36636221408844, "logits/rejected": -1.0097697973251343, "logps/chosen": -236.0571746826172, "logps/rejected": -290.71966552734375, "loss": 0.6812, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.0214462261646986, "rewards/margins": 0.024356219917535782, "rewards/margins_max": 0.03478908911347389, "rewards/margins_min": 0.01392335630953312, "rewards/margins_std": 0.014754298143088818, "rewards/rejected": -0.0029099967796355486, "step": 290 }, { "epoch": 0.24, "grad_norm": 0.3359375, "learning_rate": 4.704803160870887e-07, "logits/chosen": -1.359491229057312, "logits/rejected": -0.9403274655342102, "logps/chosen": -336.4783630371094, "logps/rejected": -255.6852569580078, "loss": 0.6784, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.03255495801568031, "rewards/margins": 0.033007461577653885, "rewards/margins_max": 0.050988949835300446, "rewards/margins_min": 0.015025977976620197, "rewards/margins_std": 0.025429660454392433, "rewards/rejected": -0.00045250533730722964, "step": 300 }, { "epoch": 0.25, "grad_norm": 0.3125, "learning_rate": 4.6708835652552635e-07, "logits/chosen": -1.3997482061386108, "logits/rejected": -0.9649599194526672, "logps/chosen": -279.89093017578125, "logps/rejected": -223.64846801757812, "loss": 0.6808, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.02754550240933895, "rewards/margins": 0.026410916820168495, "rewards/margins_max": 0.041250623762607574, "rewards/margins_min": 0.011571208015084267, "rewards/margins_std": 0.020986516028642654, "rewards/rejected": 0.0011345893144607544, "step": 310 }, { "epoch": 0.26, "grad_norm": 0.345703125, "learning_rate": 4.635256032999568e-07, "logits/chosen": -1.4304898977279663, "logits/rejected": -1.0653866529464722, "logps/chosen": -234.1278839111328, "logps/rejected": -221.0845184326172, "loss": 0.6783, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.02343001775443554, "rewards/margins": 0.027745971456170082, "rewards/margins_max": 0.04254238307476044, "rewards/margins_min": 0.012949560768902302, "rewards/margins_std": 0.020925285294651985, "rewards/rejected": -0.004315950442105532, "step": 320 }, { "epoch": 0.27, "grad_norm": 0.33984375, "learning_rate": 4.597948593966255e-07, "logits/chosen": -1.4459021091461182, "logits/rejected": -1.0879814624786377, "logps/chosen": -223.1853485107422, "logps/rejected": -255.0786895751953, "loss": 0.6779, "rewards/accuracies": 0.875, "rewards/chosen": 0.030809756368398666, "rewards/margins": 0.026754096150398254, "rewards/margins_max": 0.04218381643295288, "rewards/margins_min": 0.011324380524456501, "rewards/margins_std": 0.02182091772556305, "rewards/rejected": 0.004055661149322987, "step": 330 }, { "epoch": 0.27, "grad_norm": 0.287109375, "learning_rate": 4.558990599679787e-07, "logits/chosen": -1.4692085981369019, "logits/rejected": -0.8924602270126343, "logps/chosen": -220.9850311279297, "logps/rejected": -194.81997680664062, "loss": 0.6787, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.02502889558672905, "rewards/margins": 0.02721070684492588, "rewards/margins_max": 0.04098900780081749, "rewards/margins_min": 0.013432410545647144, "rewards/margins_std": 0.019485458731651306, "rewards/rejected": -0.0021818140521645546, "step": 340 }, { "epoch": 0.28, "grad_norm": 0.283203125, "learning_rate": 4.518412700234406e-07, "logits/chosen": -1.5222501754760742, "logits/rejected": -1.1125866174697876, "logps/chosen": -209.9710693359375, "logps/rejected": -231.05630493164062, "loss": 0.6787, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.027966121211647987, "rewards/margins": 0.0314217247068882, "rewards/margins_max": 0.05127834156155586, "rewards/margins_min": 0.011565105989575386, "rewards/margins_std": 0.02808150090277195, "rewards/rejected": -0.0034556067548692226, "step": 350 }, { "epoch": 0.29, "grad_norm": 0.296875, "learning_rate": 4.4762468201802584e-07, "logits/chosen": -1.5034544467926025, "logits/rejected": -1.0958225727081299, "logps/chosen": -254.0304718017578, "logps/rejected": -231.09994506835938, "loss": 0.6769, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.04463629052042961, "rewards/margins": 0.03593505546450615, "rewards/margins_max": 0.0535309724509716, "rewards/margins_min": 0.01833912916481495, "rewards/margins_std": 0.02488439343869686, "rewards/rejected": 0.00870124064385891, "step": 360 }, { "epoch": 0.3, "grad_norm": 0.314453125, "learning_rate": 4.432526133406842e-07, "logits/chosen": -1.505198359489441, "logits/rejected": -1.0634045600891113, "logps/chosen": -201.5790557861328, "logps/rejected": -209.3098907470703, "loss": 0.6768, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.02870224416255951, "rewards/margins": 0.0318998247385025, "rewards/margins_max": 0.04702283442020416, "rewards/margins_min": 0.01677681878209114, "rewards/margins_std": 0.021387161687016487, "rewards/rejected": -0.003197581972926855, "step": 370 }, { "epoch": 0.31, "grad_norm": 0.3984375, "learning_rate": 4.38728503704354e-07, "logits/chosen": -1.2524049282073975, "logits/rejected": -1.0025460720062256, "logps/chosen": -220.7375946044922, "logps/rejected": -209.93576049804688, "loss": 0.6767, "rewards/accuracies": 0.875, "rewards/chosen": 0.025213871151208878, "rewards/margins": 0.029323795810341835, "rewards/margins_max": 0.04682663455605507, "rewards/margins_min": 0.011820957995951176, "rewards/margins_std": 0.024752752855420113, "rewards/rejected": -0.004109926056116819, "step": 380 }, { "epoch": 0.31, "grad_norm": 0.337890625, "learning_rate": 4.3405591243977734e-07, "logits/chosen": -1.422179937362671, "logits/rejected": -1.1084346771240234, "logps/chosen": -194.7821807861328, "logps/rejected": -230.63720703125, "loss": 0.6745, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.0322599820792675, "rewards/margins": 0.034069500863552094, "rewards/margins_max": 0.05064218491315842, "rewards/margins_min": 0.01749682053923607, "rewards/margins_std": 0.023437311872839928, "rewards/rejected": -0.001809517852962017, "step": 390 }, { "epoch": 0.32, "grad_norm": 0.33203125, "learning_rate": 4.292385156952068e-07, "logits/chosen": -1.275820016860962, "logits/rejected": -0.9008271098136902, "logps/chosen": -230.4395294189453, "logps/rejected": -231.0166778564453, "loss": 0.6745, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.034222401678562164, "rewards/margins": 0.040948402136564255, "rewards/margins_max": 0.06782921403646469, "rewards/margins_min": 0.01406758464872837, "rewards/margins_std": 0.038015216588974, "rewards/rejected": -0.006726003251969814, "step": 400 }, { "epoch": 0.33, "grad_norm": 0.357421875, "learning_rate": 4.242801035442058e-07, "logits/chosen": -1.5514808893203735, "logits/rejected": -1.1798508167266846, "logps/chosen": -223.0844268798828, "logps/rejected": -258.3182067871094, "loss": 0.6736, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.03439949080348015, "rewards/margins": 0.03853056952357292, "rewards/margins_max": 0.05988907068967819, "rewards/margins_min": 0.01717207580804825, "rewards/margins_std": 0.03020547330379486, "rewards/rejected": -0.00413108104839921, "step": 410 }, { "epoch": 0.34, "grad_norm": 0.31640625, "learning_rate": 4.1918457700381854e-07, "logits/chosen": -1.3427702188491821, "logits/rejected": -1.0590754747390747, "logps/chosen": -231.7524871826172, "logps/rejected": -223.934326171875, "loss": 0.6749, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.024727266281843185, "rewards/margins": 0.025817757472395897, "rewards/margins_max": 0.039815209805965424, "rewards/margins_min": 0.01182030700147152, "rewards/margins_std": 0.019795384258031845, "rewards/rejected": -0.0010904900263994932, "step": 420 }, { "epoch": 0.35, "grad_norm": 0.37890625, "learning_rate": 4.1395594496545603e-07, "logits/chosen": -1.4541248083114624, "logits/rejected": -0.9760845303535461, "logps/chosen": -210.18905639648438, "logps/rejected": -211.36294555664062, "loss": 0.6715, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.04199836403131485, "rewards/margins": 0.04588542506098747, "rewards/margins_max": 0.07101242244243622, "rewards/margins_min": 0.020758425816893578, "rewards/margins_std": 0.035534944385290146, "rewards/rejected": -0.0038870591670274734, "step": 430 }, { "epoch": 0.35, "grad_norm": 0.212890625, "learning_rate": 4.0859832104091136e-07, "logits/chosen": -1.6024078130722046, "logits/rejected": -0.9701696634292603, "logps/chosen": -295.61138916015625, "logps/rejected": -230.13247680664062, "loss": 0.6753, "rewards/accuracies": 0.875, "rewards/chosen": 0.03962088003754616, "rewards/margins": 0.04757533222436905, "rewards/margins_max": 0.06691477447748184, "rewards/margins_min": 0.028235893696546555, "rewards/margins_std": 0.027350088581442833, "rewards/rejected": -0.007954450324177742, "step": 440 }, { "epoch": 0.36, "grad_norm": 0.275390625, "learning_rate": 4.031159203259875e-07, "logits/chosen": -1.4899638891220093, "logits/rejected": -1.0765782594680786, "logps/chosen": -227.9605255126953, "logps/rejected": -226.0135498046875, "loss": 0.6753, "rewards/accuracies": 0.875, "rewards/chosen": 0.04151952639222145, "rewards/margins": 0.03870735317468643, "rewards/margins_max": 0.061690159142017365, "rewards/margins_min": 0.015724550932645798, "rewards/margins_std": 0.032502591609954834, "rewards/rejected": 0.0028121701907366514, "step": 450 }, { "epoch": 0.37, "grad_norm": 0.345703125, "learning_rate": 3.9751305608428204e-07, "logits/chosen": -1.5896000862121582, "logits/rejected": -0.9314081072807312, "logps/chosen": -202.0727996826172, "logps/rejected": -199.5994873046875, "loss": 0.6744, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.04337175562977791, "rewards/margins": 0.04664859548211098, "rewards/margins_max": 0.06958520412445068, "rewards/margins_min": 0.023711994290351868, "rewards/margins_std": 0.03243725746870041, "rewards/rejected": -0.0032768447417765856, "step": 460 }, { "epoch": 0.38, "grad_norm": 0.349609375, "learning_rate": 3.917941363537389e-07, "logits/chosen": -1.4111865758895874, "logits/rejected": -1.120785117149353, "logps/chosen": -225.33251953125, "logps/rejected": -242.3591766357422, "loss": 0.6734, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.03831241652369499, "rewards/margins": 0.04175466671586037, "rewards/margins_max": 0.062208663672208786, "rewards/margins_min": 0.021300669759511948, "rewards/margins_std": 0.028926318511366844, "rewards/rejected": -0.0034422471653670073, "step": 470 }, { "epoch": 0.39, "grad_norm": 0.322265625, "learning_rate": 3.8596366047863713e-07, "logits/chosen": -1.3947049379348755, "logits/rejected": -0.9301086664199829, "logps/chosen": -230.3119354248047, "logps/rejected": -210.1667938232422, "loss": 0.6736, "rewards/accuracies": 0.875, "rewards/chosen": 0.03262932971119881, "rewards/margins": 0.03960399329662323, "rewards/margins_max": 0.06856902688741684, "rewards/margins_min": 0.0106389494612813, "rewards/margins_std": 0.04096274822950363, "rewards/rejected": -0.006974661257117987, "step": 480 }, { "epoch": 0.39, "grad_norm": 0.328125, "learning_rate": 3.800262155697436e-07, "logits/chosen": -1.3944153785705566, "logits/rejected": -1.137308120727539, "logps/chosen": -234.61203002929688, "logps/rejected": -234.4260711669922, "loss": 0.6709, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.03722482547163963, "rewards/margins": 0.03539041429758072, "rewards/margins_max": 0.05553443357348442, "rewards/margins_min": 0.015246398746967316, "rewards/margins_std": 0.02848794497549534, "rewards/rejected": 0.0018344109412282705, "step": 490 }, { "epoch": 0.4, "grad_norm": 0.365234375, "learning_rate": 3.7398647289541694e-07, "logits/chosen": -1.375815749168396, "logits/rejected": -0.90796959400177, "logps/chosen": -281.0382385253906, "logps/rejected": -274.0350646972656, "loss": 0.6709, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.05070463940501213, "rewards/margins": 0.042610641568899155, "rewards/margins_max": 0.07397975027561188, "rewards/margins_min": 0.011241519823670387, "rewards/margins_std": 0.0443626344203949, "rewards/rejected": 0.008093999698758125, "step": 500 }, { "epoch": 0.41, "grad_norm": 0.29296875, "learning_rate": 3.6784918420649944e-07, "logits/chosen": -1.46445631980896, "logits/rejected": -1.071272611618042, "logps/chosen": -255.32406616210938, "logps/rejected": -228.7417449951172, "loss": 0.6708, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.049185942858457565, "rewards/margins": 0.04181582108139992, "rewards/margins_max": 0.06617414206266403, "rewards/margins_min": 0.017457496374845505, "rewards/margins_std": 0.03444787114858627, "rewards/rejected": 0.0073701245710253716, "step": 510 }, { "epoch": 0.42, "grad_norm": 0.263671875, "learning_rate": 3.616191779978907e-07, "logits/chosen": -1.3251069784164429, "logits/rejected": -0.9809337854385376, "logps/chosen": -245.78067016601562, "logps/rejected": -212.802734375, "loss": 0.6714, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.036992188543081284, "rewards/margins": 0.039638452231884, "rewards/margins_max": 0.06334976106882095, "rewards/margins_min": 0.0159271452575922, "rewards/margins_std": 0.03353285789489746, "rewards/rejected": -0.0026462660171091557, "step": 520 }, { "epoch": 0.43, "grad_norm": 0.380859375, "learning_rate": 3.5530135570974273e-07, "logits/chosen": -1.3817245960235596, "logits/rejected": -1.1675258874893188, "logps/chosen": -220.3159942626953, "logps/rejected": -234.587890625, "loss": 0.6735, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.026155227795243263, "rewards/margins": 0.03257446736097336, "rewards/margins_max": 0.05421394854784012, "rewards/margins_min": 0.01093499269336462, "rewards/margins_std": 0.03060283698141575, "rewards/rejected": -0.006419241428375244, "step": 530 }, { "epoch": 0.43, "grad_norm": 0.322265625, "learning_rate": 3.489006878712647e-07, "logits/chosen": -1.3741796016693115, "logits/rejected": -1.0641679763793945, "logps/chosen": -240.35317993164062, "logps/rejected": -219.55654907226562, "loss": 0.671, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.04678519815206528, "rewards/margins": 0.040003955364227295, "rewards/margins_max": 0.06497316062450409, "rewards/margins_min": 0.015034748241305351, "rewards/margins_std": 0.03531179204583168, "rewards/rejected": 0.006781242787837982, "step": 540 }, { "epoch": 0.44, "grad_norm": 0.32421875, "learning_rate": 3.4242221019017376e-07, "logits/chosen": -1.4753143787384033, "logits/rejected": -1.0424844026565552, "logps/chosen": -248.69302368164062, "logps/rejected": -219.41763305664062, "loss": 0.6694, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.04296140745282173, "rewards/margins": 0.04576890915632248, "rewards/margins_max": 0.06679528951644897, "rewards/margins_min": 0.024742530658841133, "rewards/margins_std": 0.02973579242825508, "rewards/rejected": -0.0028075044974684715, "step": 550 }, { "epoch": 0.45, "grad_norm": 0.3125, "learning_rate": 3.3587101959086524e-07, "logits/chosen": -1.5347524881362915, "logits/rejected": -1.1209291219711304, "logps/chosen": -278.2806701660156, "logps/rejected": -257.2729797363281, "loss": 0.6733, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.045897580683231354, "rewards/margins": 0.04410725086927414, "rewards/margins_max": 0.06342413276433945, "rewards/margins_min": 0.024790368974208832, "rewards/margins_std": 0.02731819823384285, "rewards/rejected": 0.0017903331900015473, "step": 560 }, { "epoch": 0.46, "grad_norm": 0.326171875, "learning_rate": 3.29252270204422e-07, "logits/chosen": -1.5542300939559937, "logits/rejected": -0.8301711082458496, "logps/chosen": -303.486328125, "logps/rejected": -225.38955688476562, "loss": 0.6703, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.05749259144067764, "rewards/margins": 0.06660069525241852, "rewards/margins_max": 0.09760721772909164, "rewards/margins_min": 0.035594161599874496, "rewards/margins_std": 0.043849848210811615, "rewards/rejected": -0.009108101017773151, "step": 570 }, { "epoch": 0.47, "grad_norm": 0.23828125, "learning_rate": 3.2257116931361555e-07, "logits/chosen": -1.3905324935913086, "logits/rejected": -1.0293217897415161, "logps/chosen": -236.79159545898438, "logps/rejected": -263.9286804199219, "loss": 0.6724, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.041198089718818665, "rewards/margins": 0.03653653711080551, "rewards/margins_max": 0.05748134106397629, "rewards/margins_min": 0.015591728501021862, "rewards/margins_std": 0.02962043322622776, "rewards/rejected": 0.004661554470658302, "step": 580 }, { "epoch": 0.47, "grad_norm": 0.265625, "learning_rate": 3.158329732560912e-07, "logits/chosen": -1.3383656740188599, "logits/rejected": -1.0018999576568604, "logps/chosen": -253.00485229492188, "logps/rejected": -225.4741668701172, "loss": 0.6695, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.03987802192568779, "rewards/margins": 0.04215165227651596, "rewards/margins_max": 0.07311449199914932, "rewards/margins_min": 0.011188811622560024, "rewards/margins_std": 0.04378807544708252, "rewards/rejected": -0.00227363221347332, "step": 590 }, { "epoch": 0.48, "grad_norm": 0.2109375, "learning_rate": 3.090429832889586e-07, "logits/chosen": -1.4599800109863281, "logits/rejected": -1.127780556678772, "logps/chosen": -241.9589080810547, "logps/rejected": -239.2523956298828, "loss": 0.6731, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.03444468602538109, "rewards/margins": 0.042148418724536896, "rewards/margins_max": 0.06826899945735931, "rewards/margins_min": 0.016027843579649925, "rewards/margins_std": 0.0369400680065155, "rewards/rejected": -0.007703735027462244, "step": 600 }, { "epoch": 0.49, "grad_norm": 0.341796875, "learning_rate": 3.022065414180425e-07, "logits/chosen": -1.318530797958374, "logits/rejected": -1.011063814163208, "logps/chosen": -220.6621551513672, "logps/rejected": -254.8894805908203, "loss": 0.67, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.03605925291776657, "rewards/margins": 0.04434169456362724, "rewards/margins_max": 0.06485582888126373, "rewards/margins_min": 0.02382757142186165, "rewards/margins_std": 0.029011353850364685, "rewards/rejected": -0.008282448165118694, "step": 610 }, { "epoch": 0.5, "grad_norm": 0.2451171875, "learning_rate": 2.953290261950746e-07, "logits/chosen": -1.5934768915176392, "logits/rejected": -1.083901047706604, "logps/chosen": -247.7360382080078, "logps/rejected": -254.74172973632812, "loss": 0.6687, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.0549856536090374, "rewards/margins": 0.05393069237470627, "rewards/margins_max": 0.08093124628067017, "rewards/margins_min": 0.026930134743452072, "rewards/margins_std": 0.03818455711007118, "rewards/rejected": 0.0010549660073593259, "step": 620 }, { "epoch": 0.51, "grad_norm": 0.404296875, "learning_rate": 2.884158484861325e-07, "logits/chosen": -1.2959980964660645, "logits/rejected": -1.0696645975112915, "logps/chosen": -243.71060180664062, "logps/rejected": -222.5481414794922, "loss": 0.6709, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.03977753221988678, "rewards/margins": 0.05109265446662903, "rewards/margins_max": 0.07561665773391724, "rewards/margins_min": 0.02656865119934082, "rewards/margins_std": 0.034682177007198334, "rewards/rejected": -0.011315122246742249, "step": 630 }, { "epoch": 0.51, "grad_norm": 0.33203125, "learning_rate": 2.8147244721465633e-07, "logits/chosen": -1.4550827741622925, "logits/rejected": -0.958722710609436, "logps/chosen": -279.07147216796875, "logps/rejected": -229.49014282226562, "loss": 0.6682, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.04491620138287544, "rewards/margins": 0.052156962454319, "rewards/margins_max": 0.07826672494411469, "rewards/margins_min": 0.026047203689813614, "rewards/margins_std": 0.03692477568984032, "rewards/rejected": -0.007240762002766132, "step": 640 }, { "epoch": 0.52, "grad_norm": 0.30859375, "learning_rate": 2.745042850823902e-07, "logits/chosen": -1.4900524616241455, "logits/rejected": -1.0514090061187744, "logps/chosen": -285.2456359863281, "logps/rejected": -279.95697021484375, "loss": 0.672, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.0385371670126915, "rewards/margins": 0.04171828180551529, "rewards/margins_max": 0.06382079422473907, "rewards/margins_min": 0.019615760073065758, "rewards/margins_std": 0.03125768154859543, "rewards/rejected": -0.003181110369041562, "step": 650 }, { "epoch": 0.53, "grad_norm": 0.35546875, "learning_rate": 2.6751684427161683e-07, "logits/chosen": -1.465968132019043, "logits/rejected": -0.9878286123275757, "logps/chosen": -242.22793579101562, "logps/rejected": -262.17523193359375, "loss": 0.6708, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.046739913523197174, "rewards/margins": 0.05252423882484436, "rewards/margins_max": 0.08502652496099472, "rewards/margins_min": 0.020021939650177956, "rewards/margins_std": 0.04596519097685814, "rewards/rejected": -0.005784327629953623, "step": 660 }, { "epoch": 0.54, "grad_norm": 0.2451171875, "learning_rate": 2.605156221320663e-07, "logits/chosen": -1.4037545919418335, "logits/rejected": -1.0441503524780273, "logps/chosen": -211.6615447998047, "logps/rejected": -209.14291381835938, "loss": 0.6712, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.03919681906700134, "rewards/margins": 0.04562688618898392, "rewards/margins_max": 0.06264550983905792, "rewards/margins_min": 0.028608258813619614, "rewards/margins_std": 0.024067968130111694, "rewards/rejected": -0.006430068518966436, "step": 670 }, { "epoch": 0.55, "grad_norm": 0.263671875, "learning_rate": 2.5350612685589056e-07, "logits/chosen": -1.376564621925354, "logits/rejected": -0.7623709440231323, "logps/chosen": -282.1781311035156, "logps/rejected": -230.0469512939453, "loss": 0.6661, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.050868190824985504, "rewards/margins": 0.0660594254732132, "rewards/margins_max": 0.1021636500954628, "rewards/margins_min": 0.029955202713608742, "rewards/margins_std": 0.05105908587574959, "rewards/rejected": -0.01519124023616314, "step": 680 }, { "epoch": 0.55, "grad_norm": 0.2490234375, "learning_rate": 2.464938731441094e-07, "logits/chosen": -1.3413498401641846, "logits/rejected": -0.9205705523490906, "logps/chosen": -223.13720703125, "logps/rejected": -215.90542602539062, "loss": 0.6699, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.034288205206394196, "rewards/margins": 0.045619405806064606, "rewards/margins_max": 0.0685984343290329, "rewards/margins_min": 0.022640381008386612, "rewards/margins_std": 0.032497256994247437, "rewards/rejected": -0.01133120246231556, "step": 690 }, { "epoch": 0.56, "grad_norm": 0.27734375, "learning_rate": 2.3948437786793377e-07, "logits/chosen": -1.3168880939483643, "logits/rejected": -1.021528720855713, "logps/chosen": -232.2949981689453, "logps/rejected": -201.44692993164062, "loss": 0.6689, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.03319329768419266, "rewards/margins": 0.037660352885723114, "rewards/margins_max": 0.06672637164592743, "rewards/margins_min": 0.008594331331551075, "rewards/margins_std": 0.04110556095838547, "rewards/rejected": -0.004467054270207882, "step": 700 }, { "epoch": 0.57, "grad_norm": 0.302734375, "learning_rate": 2.3248315572838315e-07, "logits/chosen": -1.5475190877914429, "logits/rejected": -1.1727778911590576, "logps/chosen": -187.8809356689453, "logps/rejected": -202.12332153320312, "loss": 0.671, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.03501967713236809, "rewards/margins": 0.04390306398272514, "rewards/margins_max": 0.06672258675098419, "rewards/margins_min": 0.021083541214466095, "rewards/margins_std": 0.03227167949080467, "rewards/rejected": -0.008883384987711906, "step": 710 }, { "epoch": 0.58, "grad_norm": 0.2197265625, "learning_rate": 2.2549571491760981e-07, "logits/chosen": -1.6752458810806274, "logits/rejected": -1.3541884422302246, "logps/chosen": -200.15052795410156, "logps/rejected": -202.5867919921875, "loss": 0.6693, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.03528157249093056, "rewards/margins": 0.04764563962817192, "rewards/margins_max": 0.07624481618404388, "rewards/margins_min": 0.019046466797590256, "rewards/margins_std": 0.04044533520936966, "rewards/rejected": -0.012364069931209087, "step": 720 }, { "epoch": 0.59, "grad_norm": 0.359375, "learning_rate": 2.185275527853437e-07, "logits/chosen": -1.5094270706176758, "logits/rejected": -1.1543543338775635, "logps/chosen": -204.6774444580078, "logps/rejected": -197.69451904296875, "loss": 0.6709, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.038943078368902206, "rewards/margins": 0.04444164037704468, "rewards/margins_max": 0.07106862962245941, "rewards/margins_min": 0.017814649268984795, "rewards/margins_std": 0.037656255066394806, "rewards/rejected": -0.005498562939465046, "step": 730 }, { "epoch": 0.59, "grad_norm": 0.306640625, "learning_rate": 2.1158415151386743e-07, "logits/chosen": -1.4634640216827393, "logits/rejected": -0.944066047668457, "logps/chosen": -251.1621856689453, "logps/rejected": -231.7174530029297, "loss": 0.6677, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.044490996748209, "rewards/margins": 0.05201805755496025, "rewards/margins_max": 0.06704308092594147, "rewards/margins_min": 0.03699304163455963, "rewards/margins_std": 0.0212485883384943, "rewards/rejected": -0.007527066860347986, "step": 740 }, { "epoch": 0.6, "grad_norm": 0.287109375, "learning_rate": 2.0467097380492543e-07, "logits/chosen": -1.4197427034378052, "logits/rejected": -0.9803106188774109, "logps/chosen": -213.04525756835938, "logps/rejected": -206.525634765625, "loss": 0.6677, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.0434570387005806, "rewards/margins": 0.047513801604509354, "rewards/margins_max": 0.07722840458154678, "rewards/margins_min": 0.017799200490117073, "rewards/margins_std": 0.042022787034511566, "rewards/rejected": -0.00405676057562232, "step": 750 }, { "epoch": 0.61, "grad_norm": 0.35546875, "learning_rate": 1.9779345858195756e-07, "logits/chosen": -1.3926583528518677, "logits/rejected": -1.076912522315979, "logps/chosen": -238.19656372070312, "logps/rejected": -217.7411346435547, "loss": 0.6673, "rewards/accuracies": 0.875, "rewards/chosen": 0.03039366379380226, "rewards/margins": 0.037511684000492096, "rewards/margins_max": 0.06615015864372253, "rewards/margins_min": 0.008873210288584232, "rewards/margins_std": 0.04050092026591301, "rewards/rejected": -0.0071180230006575584, "step": 760 }, { "epoch": 0.62, "grad_norm": 0.322265625, "learning_rate": 1.9095701671104148e-07, "logits/chosen": -1.4074318408966064, "logits/rejected": -1.0390832424163818, "logps/chosen": -243.4113006591797, "logps/rejected": -247.462646484375, "loss": 0.6632, "rewards/accuracies": 0.875, "rewards/chosen": 0.056570518761873245, "rewards/margins": 0.05667363852262497, "rewards/margins_max": 0.08418169617652893, "rewards/margins_min": 0.029165586456656456, "rewards/margins_std": 0.03890226036310196, "rewards/rejected": -0.00010311976075172424, "step": 770 }, { "epoch": 0.63, "grad_norm": 0.3359375, "learning_rate": 1.8416702674390878e-07, "logits/chosen": -1.3678066730499268, "logits/rejected": -0.8871816396713257, "logps/chosen": -259.7790832519531, "logps/rejected": -197.2100830078125, "loss": 0.6715, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.03565958887338638, "rewards/margins": 0.04336465895175934, "rewards/margins_max": 0.07163818180561066, "rewards/margins_min": 0.0150911258533597, "rewards/margins_std": 0.0399848073720932, "rewards/rejected": -0.007705070078372955, "step": 780 }, { "epoch": 0.63, "grad_norm": 0.28125, "learning_rate": 1.7742883068638445e-07, "logits/chosen": -1.4952503442764282, "logits/rejected": -0.9382287263870239, "logps/chosen": -298.07562255859375, "logps/rejected": -226.93359375, "loss": 0.6704, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.05212609842419624, "rewards/margins": 0.05277561396360397, "rewards/margins_max": 0.08954766392707825, "rewards/margins_min": 0.016003567725419998, "rewards/margins_std": 0.05200352519750595, "rewards/rejected": -0.0006495246780104935, "step": 790 }, { "epoch": 0.64, "grad_norm": 0.310546875, "learning_rate": 1.70747729795578e-07, "logits/chosen": -1.4829334020614624, "logits/rejected": -1.0575454235076904, "logps/chosen": -233.4453582763672, "logps/rejected": -262.2522888183594, "loss": 0.6712, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.04883468151092529, "rewards/margins": 0.05357099696993828, "rewards/margins_max": 0.08339022099971771, "rewards/margins_min": 0.023751774802803993, "rewards/margins_std": 0.042170751839876175, "rewards/rejected": -0.004736318252980709, "step": 800 }, { "epoch": 0.65, "grad_norm": 0.298828125, "learning_rate": 1.641289804091347e-07, "logits/chosen": -1.5801312923431396, "logits/rejected": -1.2438874244689941, "logps/chosen": -204.72030639648438, "logps/rejected": -268.27166748046875, "loss": 0.6702, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.04398995637893677, "rewards/margins": 0.04688789322972298, "rewards/margins_max": 0.06833849847316742, "rewards/margins_min": 0.025437291711568832, "rewards/margins_std": 0.03033572994172573, "rewards/rejected": -0.002897942438721657, "step": 810 }, { "epoch": 0.66, "grad_norm": 0.306640625, "learning_rate": 1.5757778980982624e-07, "logits/chosen": -1.3541462421417236, "logits/rejected": -0.9108270406723022, "logps/chosen": -230.75039672851562, "logps/rejected": -223.6853485107422, "loss": 0.6707, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.042187005281448364, "rewards/margins": 0.039305832237005234, "rewards/margins_max": 0.059637099504470825, "rewards/margins_min": 0.01897456869482994, "rewards/margins_std": 0.028752749785780907, "rewards/rejected": 0.002881168620660901, "step": 820 }, { "epoch": 0.67, "grad_norm": 0.330078125, "learning_rate": 1.5109931212873534e-07, "logits/chosen": -1.7181932926177979, "logits/rejected": -1.0062940120697021, "logps/chosen": -280.510986328125, "logps/rejected": -280.473876953125, "loss": 0.6646, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.056802086532115936, "rewards/margins": 0.0675993412733078, "rewards/margins_max": 0.10450136661529541, "rewards/margins_min": 0.03069731593132019, "rewards/margins_std": 0.05218734219670296, "rewards/rejected": -0.010797259397804737, "step": 830 }, { "epoch": 0.67, "grad_norm": 0.353515625, "learning_rate": 1.4469864429025738e-07, "logits/chosen": -1.2235758304595947, "logits/rejected": -1.028109073638916, "logps/chosen": -214.80874633789062, "logps/rejected": -197.6857452392578, "loss": 0.6714, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.03331439942121506, "rewards/margins": 0.040659673511981964, "rewards/margins_max": 0.06538619846105576, "rewards/margins_min": 0.015933137387037277, "rewards/margins_std": 0.03496859595179558, "rewards/rejected": -0.007345269434154034, "step": 840 }, { "epoch": 0.68, "grad_norm": 0.263671875, "learning_rate": 1.383808220021093e-07, "logits/chosen": -1.495947241783142, "logits/rejected": -1.0667431354522705, "logps/chosen": -210.7353057861328, "logps/rejected": -248.39169311523438, "loss": 0.6699, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.036492932587862015, "rewards/margins": 0.04973422735929489, "rewards/margins_max": 0.08226160705089569, "rewards/margins_min": 0.017206856980919838, "rewards/margins_std": 0.0460006520152092, "rewards/rejected": -0.013241296634078026, "step": 850 }, { "epoch": 0.69, "grad_norm": 0.373046875, "learning_rate": 1.3215081579350056e-07, "logits/chosen": -1.412252426147461, "logits/rejected": -0.9245221018791199, "logps/chosen": -268.7840576171875, "logps/rejected": -228.3775634765625, "loss": 0.668, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.044978540390729904, "rewards/margins": 0.05806935578584671, "rewards/margins_max": 0.08858608454465866, "rewards/margins_min": 0.02755262330174446, "rewards/margins_std": 0.043157171458005905, "rewards/rejected": -0.013090811669826508, "step": 860 }, { "epoch": 0.7, "grad_norm": 0.275390625, "learning_rate": 1.2601352710458312e-07, "logits/chosen": -1.3982574939727783, "logits/rejected": -1.0772249698638916, "logps/chosen": -252.6017303466797, "logps/rejected": -272.05438232421875, "loss": 0.6705, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.040421146899461746, "rewards/margins": 0.0438043586909771, "rewards/margins_max": 0.06623619049787521, "rewards/margins_min": 0.021372521296143532, "rewards/margins_std": 0.03172340989112854, "rewards/rejected": -0.003383214818313718, "step": 870 }, { "epoch": 0.71, "grad_norm": 0.373046875, "learning_rate": 1.1997378443025634e-07, "logits/chosen": -1.3360648155212402, "logits/rejected": -0.9048384428024292, "logps/chosen": -278.68487548828125, "logps/rejected": -276.01800537109375, "loss": 0.6697, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.03929431363940239, "rewards/margins": 0.053578175604343414, "rewards/margins_max": 0.08283337205648422, "rewards/margins_min": 0.024322964251041412, "rewards/margins_std": 0.041373111307621, "rewards/rejected": -0.014283858239650726, "step": 880 }, { "epoch": 0.71, "grad_norm": 0.248046875, "learning_rate": 1.1403633952136288e-07, "logits/chosen": -1.3096376657485962, "logits/rejected": -1.0314531326293945, "logps/chosen": -229.771484375, "logps/rejected": -259.46124267578125, "loss": 0.6709, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.04626529663801193, "rewards/margins": 0.044107310473918915, "rewards/margins_max": 0.07106141746044159, "rewards/margins_min": 0.01715320721268654, "rewards/margins_std": 0.038118861615657806, "rewards/rejected": 0.0021579854656010866, "step": 890 }, { "epoch": 0.72, "grad_norm": 0.34375, "learning_rate": 1.0820586364626102e-07, "logits/chosen": -1.4800525903701782, "logits/rejected": -1.0487653017044067, "logps/chosen": -226.35171508789062, "logps/rejected": -220.342041015625, "loss": 0.6699, "rewards/accuracies": 0.875, "rewards/chosen": 0.03789624944329262, "rewards/margins": 0.04697556048631668, "rewards/margins_max": 0.07865092158317566, "rewards/margins_min": 0.01530019473284483, "rewards/margins_std": 0.044795725494623184, "rewards/rejected": -0.009079309180378914, "step": 900 }, { "epoch": 0.73, "grad_norm": 0.310546875, "learning_rate": 1.0248694391571799e-07, "logits/chosen": -1.5295828580856323, "logits/rejected": -1.0794126987457275, "logps/chosen": -193.26638793945312, "logps/rejected": -178.80874633789062, "loss": 0.6691, "rewards/accuracies": 0.875, "rewards/chosen": 0.03320566937327385, "rewards/margins": 0.04501482471823692, "rewards/margins_max": 0.07719631493091583, "rewards/margins_min": 0.012833328917622566, "rewards/margins_std": 0.04551151022315025, "rewards/rejected": -0.011809155344963074, "step": 910 }, { "epoch": 0.74, "grad_norm": 0.306640625, "learning_rate": 9.688407967401247e-08, "logits/chosen": -1.5370663404464722, "logits/rejected": -1.0957996845245361, "logps/chosen": -262.6162109375, "logps/rejected": -238.84619140625, "loss": 0.6675, "rewards/accuracies": 0.875, "rewards/chosen": 0.04341619461774826, "rewards/margins": 0.041754595935344696, "rewards/margins_max": 0.0651809424161911, "rewards/margins_min": 0.018328242003917694, "rewards/margins_std": 0.03312985971570015, "rewards/rejected": 0.001661602407693863, "step": 920 }, { "epoch": 0.75, "grad_norm": 0.25, "learning_rate": 9.140167895908865e-08, "logits/chosen": -1.3569977283477783, "logits/rejected": -1.006054401397705, "logps/chosen": -229.92843627929688, "logps/rejected": -220.2310333251953, "loss": 0.6707, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.03869252651929855, "rewards/margins": 0.04064718633890152, "rewards/margins_max": 0.06924192607402802, "rewards/margins_min": 0.012052436359226704, "rewards/margins_std": 0.04043908044695854, "rewards/rejected": -0.0019546542316675186, "step": 930 }, { "epoch": 0.76, "grad_norm": 0.3671875, "learning_rate": 8.604405503454399e-08, "logits/chosen": -1.4614689350128174, "logits/rejected": -0.9524902105331421, "logps/chosen": -257.48260498046875, "logps/rejected": -288.1595458984375, "loss": 0.6682, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.04461338371038437, "rewards/margins": 0.052332986146211624, "rewards/margins_max": 0.08000707626342773, "rewards/margins_min": 0.024658896028995514, "rewards/margins_std": 0.03913707286119461, "rewards/rejected": -0.007719600107520819, "step": 940 }, { "epoch": 0.76, "grad_norm": 0.29296875, "learning_rate": 8.081542299618138e-08, "logits/chosen": -1.4888681173324585, "logits/rejected": -1.0110059976577759, "logps/chosen": -271.58648681640625, "logps/rejected": -271.7052917480469, "loss": 0.6686, "rewards/accuracies": 0.875, "rewards/chosen": 0.05115025117993355, "rewards/margins": 0.046790122985839844, "rewards/margins_max": 0.07547706365585327, "rewards/margins_min": 0.018103178590536118, "rewards/margins_std": 0.04056946188211441, "rewards/rejected": 0.004360135179013014, "step": 950 }, { "epoch": 0.77, "grad_norm": 0.27734375, "learning_rate": 7.571989645579419e-08, "logits/chosen": -1.3773242235183716, "logits/rejected": -0.9570341110229492, "logps/chosen": -276.8484802246094, "logps/rejected": -292.5472717285156, "loss": 0.6689, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.04309418797492981, "rewards/margins": 0.04832986742258072, "rewards/margins_max": 0.0815201848745346, "rewards/margins_min": 0.015139535069465637, "rewards/margins_std": 0.0469382181763649, "rewards/rejected": -0.005235675722360611, "step": 960 }, { "epoch": 0.78, "grad_norm": 0.333984375, "learning_rate": 7.07614843047932e-08, "logits/chosen": -1.453420877456665, "logits/rejected": -1.0132516622543335, "logps/chosen": -231.8670654296875, "logps/rejected": -229.90701293945312, "loss": 0.669, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.039587706327438354, "rewards/margins": 0.04945527762174606, "rewards/margins_max": 0.08132112771272659, "rewards/margins_min": 0.017589423805475235, "rewards/margins_std": 0.045065123587846756, "rewards/rejected": -0.009867568500339985, "step": 970 }, { "epoch": 0.79, "grad_norm": 0.271484375, "learning_rate": 6.594408756022272e-08, "logits/chosen": -1.3352272510528564, "logits/rejected": -0.9246931076049805, "logps/chosen": -247.74301147460938, "logps/rejected": -251.9709014892578, "loss": 0.6694, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.043303996324539185, "rewards/margins": 0.044630784541368484, "rewards/margins_max": 0.06431631743907928, "rewards/margins_min": 0.024945247918367386, "rewards/margins_std": 0.027839556336402893, "rewards/rejected": -0.001326788100413978, "step": 980 }, { "epoch": 0.8, "grad_norm": 0.263671875, "learning_rate": 6.127149629564604e-08, "logits/chosen": -1.564333200454712, "logits/rejected": -0.989848792552948, "logps/chosen": -281.92041015625, "logps/rejected": -245.86181640625, "loss": 0.6669, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.04692433401942253, "rewards/margins": 0.05381812900304794, "rewards/margins_max": 0.07959593087434769, "rewards/margins_min": 0.028040319681167603, "rewards/margins_std": 0.036455318331718445, "rewards/rejected": -0.0068937926553189754, "step": 990 }, { "epoch": 0.8, "grad_norm": 0.3125, "learning_rate": 5.674738665931575e-08, "logits/chosen": -1.4035413265228271, "logits/rejected": -1.1324512958526611, "logps/chosen": -209.9371337890625, "logps/rejected": -225.2702178955078, "loss": 0.6696, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.044688571244478226, "rewards/margins": 0.03946073725819588, "rewards/margins_max": 0.06743495166301727, "rewards/margins_min": 0.011486515402793884, "rewards/margins_std": 0.039561524987220764, "rewards/rejected": 0.005227833054959774, "step": 1000 }, { "epoch": 0.81, "grad_norm": 0.2890625, "learning_rate": 5.2375317981974146e-08, "logits/chosen": -1.3417729139328003, "logits/rejected": -0.9221906661987305, "logps/chosen": -213.33682250976562, "logps/rejected": -210.5022735595703, "loss": 0.6683, "rewards/accuracies": 0.875, "rewards/chosen": 0.041867054998874664, "rewards/margins": 0.041164129972457886, "rewards/margins_max": 0.0679064616560936, "rewards/margins_min": 0.014421803876757622, "rewards/margins_std": 0.03781936317682266, "rewards/rejected": 0.0007029235130175948, "step": 1010 }, { "epoch": 0.82, "grad_norm": 0.291015625, "learning_rate": 4.81587299765594e-08, "logits/chosen": -1.6060631275177002, "logits/rejected": -0.9410206079483032, "logps/chosen": -268.5170593261719, "logps/rejected": -277.1543273925781, "loss": 0.6675, "rewards/accuracies": 0.875, "rewards/chosen": 0.05788182467222214, "rewards/margins": 0.06077752262353897, "rewards/margins_max": 0.09126977622509003, "rewards/margins_min": 0.030285265296697617, "rewards/margins_std": 0.043122559785842896, "rewards/rejected": -0.0028956946916878223, "step": 1020 }, { "epoch": 0.83, "grad_norm": 0.328125, "learning_rate": 4.410094003202133e-08, "logits/chosen": -1.4907371997833252, "logits/rejected": -1.1430540084838867, "logps/chosen": -206.7045440673828, "logps/rejected": -214.552734375, "loss": 0.6681, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.04669869691133499, "rewards/margins": 0.04530167207121849, "rewards/margins_max": 0.07714711129665375, "rewards/margins_min": 0.01345623005181551, "rewards/margins_std": 0.045036256313323975, "rewards/rejected": 0.001397026120685041, "step": 1030 }, { "epoch": 0.84, "grad_norm": 0.29296875, "learning_rate": 4.020514060337446e-08, "logits/chosen": -1.456228256225586, "logits/rejected": -0.9336725473403931, "logps/chosen": -240.61669921875, "logps/rejected": -231.38449096679688, "loss": 0.6686, "rewards/accuracies": 0.875, "rewards/chosen": 0.04265180975198746, "rewards/margins": 0.05336352065205574, "rewards/margins_max": 0.08747194707393646, "rewards/margins_min": 0.019255097955465317, "rewards/margins_std": 0.04823658615350723, "rewards/rejected": -0.010711712762713432, "step": 1040 }, { "epoch": 0.84, "grad_norm": 0.291015625, "learning_rate": 3.647439670004315e-08, "logits/chosen": -1.4223265647888184, "logits/rejected": -1.0329468250274658, "logps/chosen": -198.4597930908203, "logps/rejected": -196.15701293945312, "loss": 0.6718, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.038055285811424255, "rewards/margins": 0.03567874804139137, "rewards/margins_max": 0.05416553467512131, "rewards/margins_min": 0.017191965132951736, "rewards/margins_std": 0.02614426054060459, "rewards/rejected": 0.0023765391670167446, "step": 1050 }, { "epoch": 0.85, "grad_norm": 0.33203125, "learning_rate": 3.2911643474473644e-08, "logits/chosen": -1.3081260919570923, "logits/rejected": -0.9144113659858704, "logps/chosen": -200.09619140625, "logps/rejected": -189.586181640625, "loss": 0.6688, "rewards/accuracies": 0.875, "rewards/chosen": 0.03917083144187927, "rewards/margins": 0.04059765860438347, "rewards/margins_max": 0.068463034927845, "rewards/margins_min": 0.012732280418276787, "rewards/margins_std": 0.03940759226679802, "rewards/rejected": -0.0014268273953348398, "step": 1060 }, { "epoch": 0.86, "grad_norm": 0.26953125, "learning_rate": 2.9519683912911263e-08, "logits/chosen": -1.5238087177276611, "logits/rejected": -1.0078160762786865, "logps/chosen": -294.36871337890625, "logps/rejected": -224.5919647216797, "loss": 0.6699, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.038640812039375305, "rewards/margins": 0.0416460782289505, "rewards/margins_max": 0.07079333066940308, "rewards/margins_min": 0.012498823925852776, "rewards/margins_std": 0.04122043773531914, "rewards/rejected": -0.0030052694492042065, "step": 1070 }, { "epoch": 0.87, "grad_norm": 0.34765625, "learning_rate": 2.6301186630158484e-08, "logits/chosen": -1.4236295223236084, "logits/rejected": -0.9124472737312317, "logps/chosen": -241.5544891357422, "logps/rejected": -206.624755859375, "loss": 0.6706, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.047010261565446854, "rewards/margins": 0.04852353036403656, "rewards/margins_max": 0.07083114981651306, "rewards/margins_min": 0.026215914636850357, "rewards/margins_std": 0.03154773265123367, "rewards/rejected": -0.0015132713597267866, "step": 1080 }, { "epoch": 0.88, "grad_norm": 0.314453125, "learning_rate": 2.325868377004986e-08, "logits/chosen": -1.4360002279281616, "logits/rejected": -0.9177436828613281, "logps/chosen": -257.7289733886719, "logps/rejected": -248.4482879638672, "loss": 0.668, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.05283036828041077, "rewards/margins": 0.053740471601486206, "rewards/margins_max": 0.08627776801586151, "rewards/margins_min": 0.0212031789124012, "rewards/margins_std": 0.046014681458473206, "rewards/rejected": -0.0009101040777750313, "step": 1090 }, { "epoch": 0.88, "grad_norm": 0.267578125, "learning_rate": 2.0394569013294728e-08, "logits/chosen": -1.4799668788909912, "logits/rejected": -1.1490806341171265, "logps/chosen": -231.9544677734375, "logps/rejected": -240.9729766845703, "loss": 0.6685, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.038953814655542374, "rewards/margins": 0.049186062067747116, "rewards/margins_max": 0.0747087150812149, "rewards/margins_min": 0.02366340160369873, "rewards/margins_std": 0.03609449043869972, "rewards/rejected": -0.010232244618237019, "step": 1100 }, { "epoch": 0.89, "grad_norm": 0.25, "learning_rate": 1.7711095694255467e-08, "logits/chosen": -1.313668966293335, "logits/rejected": -0.9640175104141235, "logps/chosen": -276.17645263671875, "logps/rejected": -224.7978973388672, "loss": 0.6673, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.029724329710006714, "rewards/margins": 0.04676595330238342, "rewards/margins_max": 0.06878891587257385, "rewards/margins_min": 0.024742985144257545, "rewards/margins_std": 0.03114517591893673, "rewards/rejected": -0.01704162172973156, "step": 1110 }, { "epoch": 0.9, "grad_norm": 0.302734375, "learning_rate": 1.5210375028143095e-08, "logits/chosen": -1.4092978239059448, "logits/rejected": -1.117443323135376, "logps/chosen": -233.7253875732422, "logps/rejected": -247.8575897216797, "loss": 0.6674, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.041861675679683685, "rewards/margins": 0.04662462696433067, "rewards/margins_max": 0.07170717418193817, "rewards/margins_min": 0.02154208905994892, "rewards/margins_std": 0.0354720763862133, "rewards/rejected": -0.0047629522159695625, "step": 1120 }, { "epoch": 0.91, "grad_norm": 0.328125, "learning_rate": 1.2894374450024336e-08, "logits/chosen": -1.448512315750122, "logits/rejected": -1.0240614414215088, "logps/chosen": -260.4589538574219, "logps/rejected": -230.17886352539062, "loss": 0.6696, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.04554584249854088, "rewards/margins": 0.039973270148038864, "rewards/margins_max": 0.06665637344121933, "rewards/margins_min": 0.013290156610310078, "rewards/margins_std": 0.03773561865091324, "rewards/rejected": 0.005572572350502014, "step": 1130 }, { "epoch": 0.92, "grad_norm": 0.3203125, "learning_rate": 1.0764916066947794e-08, "logits/chosen": -1.5009466409683228, "logits/rejected": -1.0196406841278076, "logps/chosen": -215.59872436523438, "logps/rejected": -219.1211700439453, "loss": 0.6659, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.04678153246641159, "rewards/margins": 0.057925593107938766, "rewards/margins_max": 0.08430875092744827, "rewards/margins_min": 0.03154244273900986, "rewards/margins_std": 0.03731141239404678, "rewards/rejected": -0.011144058778882027, "step": 1140 }, { "epoch": 0.92, "grad_norm": 0.345703125, "learning_rate": 8.823675224406052e-09, "logits/chosen": -1.7093722820281982, "logits/rejected": -0.9901024699211121, "logps/chosen": -273.6139221191406, "logps/rejected": -257.0510559082031, "loss": 0.6673, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.05794493108987808, "rewards/margins": 0.07558706402778625, "rewards/margins_max": 0.1190909594297409, "rewards/margins_min": 0.0320831760764122, "rewards/margins_std": 0.06152379512786865, "rewards/rejected": -0.01764213666319847, "step": 1150 }, { "epoch": 0.93, "grad_norm": 0.384765625, "learning_rate": 7.0721791882622505e-09, "logits/chosen": -1.357256531715393, "logits/rejected": -1.033753752708435, "logps/chosen": -221.84225463867188, "logps/rejected": -233.3285675048828, "loss": 0.6691, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.046824563294649124, "rewards/margins": 0.05287656933069229, "rewards/margins_max": 0.08426986634731293, "rewards/margins_min": 0.021483272314071655, "rewards/margins_std": 0.04439682140946388, "rewards/rejected": -0.006052007433027029, "step": 1160 }, { "epoch": 0.94, "grad_norm": 0.337890625, "learning_rate": 5.511805943178099e-09, "logits/chosen": -1.3903011083602905, "logits/rejected": -0.7860205769538879, "logps/chosen": -258.0101318359375, "logps/rejected": -215.1144256591797, "loss": 0.6653, "rewards/accuracies": 1.0, "rewards/chosen": 0.042999833822250366, "rewards/margins": 0.06220085546374321, "rewards/margins_max": 0.09922395646572113, "rewards/margins_min": 0.025177743285894394, "rewards/margins_std": 0.05235857889056206, "rewards/rejected": -0.019201014190912247, "step": 1170 }, { "epoch": 0.95, "grad_norm": 0.251953125, "learning_rate": 4.143783108487897e-09, "logits/chosen": -1.4224960803985596, "logits/rejected": -0.9319742321968079, "logps/chosen": -271.4730529785156, "logps/rejected": -208.79818725585938, "loss": 0.6725, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.030450427904725075, "rewards/margins": 0.03488145396113396, "rewards/margins_max": 0.050618767738342285, "rewards/margins_min": 0.019144145771861076, "rewards/margins_std": 0.02225591614842415, "rewards/rejected": -0.00443103164434433, "step": 1180 }, { "epoch": 0.96, "grad_norm": 0.357421875, "learning_rate": 2.9691869723728057e-09, "logits/chosen": -1.4451096057891846, "logits/rejected": -1.0561285018920898, "logps/chosen": -278.66851806640625, "logps/rejected": -304.5954895019531, "loss": 0.6675, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.04903879761695862, "rewards/margins": 0.05575891584157944, "rewards/margins_max": 0.0802365392446518, "rewards/margins_min": 0.03128129243850708, "rewards/margins_std": 0.03461658954620361, "rewards/rejected": -0.006720119621604681, "step": 1190 }, { "epoch": 0.96, "grad_norm": 0.279296875, "learning_rate": 1.9889416450938335e-09, "logits/chosen": -1.3228471279144287, "logits/rejected": -1.0721616744995117, "logps/chosen": -190.70054626464844, "logps/rejected": -229.17367553710938, "loss": 0.6699, "rewards/accuracies": 0.875, "rewards/chosen": 0.03468576818704605, "rewards/margins": 0.03673940151929855, "rewards/margins_max": 0.06133583188056946, "rewards/margins_min": 0.0121429692953825, "rewards/margins_std": 0.03478460758924484, "rewards/rejected": -0.0020536319352686405, "step": 1200 }, { "epoch": 0.97, "grad_norm": 0.38671875, "learning_rate": 1.2038183319507956e-09, "logits/chosen": -1.4177278280258179, "logits/rejected": -1.083184838294983, "logps/chosen": -229.7062530517578, "logps/rejected": -278.3479309082031, "loss": 0.6694, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.04728575423359871, "rewards/margins": 0.052900440990924835, "rewards/margins_max": 0.09008260071277618, "rewards/margins_min": 0.01571827009320259, "rewards/margins_std": 0.05258352681994438, "rewards/rejected": -0.005614686757326126, "step": 1210 }, { "epoch": 0.98, "grad_norm": 0.302734375, "learning_rate": 6.14434726538493e-10, "logits/chosen": -1.5169966220855713, "logits/rejected": -0.8769359588623047, "logps/chosen": -254.8049774169922, "logps/rejected": -220.7210693359375, "loss": 0.6692, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.050220292061567307, "rewards/margins": 0.058179594576358795, "rewards/margins_max": 0.0886971578001976, "rewards/margins_min": 0.02766202948987484, "rewards/margins_std": 0.04315835237503052, "rewards/rejected": -0.00795929878950119, "step": 1220 }, { "epoch": 0.99, "grad_norm": 0.34765625, "learning_rate": 2.2125452477828045e-10, "logits/chosen": -1.3784905672073364, "logits/rejected": -1.1377615928649902, "logps/chosen": -194.73486328125, "logps/rejected": -264.32476806640625, "loss": 0.6653, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.04400862753391266, "rewards/margins": 0.04775675758719444, "rewards/margins_max": 0.07108040899038315, "rewards/margins_min": 0.02443309873342514, "rewards/margins_std": 0.03298462927341461, "rewards/rejected": -0.0037481263279914856, "step": 1230 }, { "epoch": 1.0, "grad_norm": 0.337890625, "learning_rate": 2.4587060106245894e-11, "logits/chosen": -1.4393399953842163, "logits/rejected": -1.0665498971939087, "logps/chosen": -236.4451446533203, "logps/rejected": -271.6033020019531, "loss": 0.6681, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.04036145657300949, "rewards/margins": 0.050877369940280914, "rewards/margins_max": 0.07593311369419098, "rewards/margins_min": 0.025821629911661148, "rewards/margins_std": 0.03543417528271675, "rewards/rejected": -0.010515912435948849, "step": 1240 }, { "epoch": 1.0, "eval_logits/chosen": -1.023886799812317, "eval_logits/rejected": -0.897911787033081, "eval_logps/chosen": -331.2293701171875, "eval_logps/rejected": -327.6042175292969, "eval_loss": 0.6931236982345581, "eval_rewards/accuracies": 0.5450000166893005, "eval_rewards/chosen": 0.026809358969330788, "eval_rewards/margins": 0.0008149489294737577, "eval_rewards/margins_max": 0.06289197504520416, "eval_rewards/margins_min": -0.064235620200634, "eval_rewards/margins_std": 0.04212768003344536, "eval_rewards/rejected": 0.02599441073834896, "eval_runtime": 750.4368, "eval_samples_per_second": 5.33, "eval_steps_per_second": 0.167, "step": 1245 }, { "epoch": 1.0, "step": 1245, "total_flos": 0.0, "train_loss": 0.6747699532642901, "train_runtime": 11262.176, "train_samples_per_second": 1.768, "train_steps_per_second": 0.111 } ], "logging_steps": 10, "max_steps": 1245, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }