diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,18454 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.9992404101785035, + "eval_steps": 500, + "global_step": 1316, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 7.575757575757576e-09, + "logits/chosen": -1.3807122707366943, + "logits/rejected": -1.4181761741638184, + "logps/chosen": -67.83101654052734, + "logps/rejected": -72.67066955566406, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0, + "learning_rate": 1.5151515151515152e-08, + "logits/chosen": -1.5273715257644653, + "logits/rejected": -1.59983491897583, + "logps/chosen": -63.50884246826172, + "logps/rejected": -72.82698059082031, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2 + }, + { + "epoch": 0.0, + "learning_rate": 2.2727272727272725e-08, + "logits/chosen": -1.293489933013916, + "logits/rejected": -1.282623529434204, + "logps/chosen": -57.460914611816406, + "logps/rejected": -81.09254455566406, + "loss": 0.6937, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0016430141404271126, + "rewards/margins": -0.0033949974458664656, + "rewards/rejected": 0.0017519830726087093, + "step": 3 + }, + { + "epoch": 0.01, + "learning_rate": 3.0303030303030305e-08, + "logits/chosen": -1.381151556968689, + "logits/rejected": -1.3468399047851562, + "logps/chosen": -52.592010498046875, + "logps/rejected": -66.39985656738281, + "loss": 0.6944, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.002350753638893366, + "rewards/margins": -0.0003365101292729378, + "rewards/rejected": -0.002014243509620428, + "step": 4 + }, + { + "epoch": 0.01, + "learning_rate": 3.787878787878788e-08, + "logits/chosen": -1.365487813949585, + "logits/rejected": -1.4028127193450928, + "logps/chosen": -48.654415130615234, + "logps/rejected": -48.227760314941406, + "loss": 0.694, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.004889738745987415, + "rewards/margins": -0.00731813907623291, + "rewards/rejected": 0.002428400330245495, + "step": 5 + }, + { + "epoch": 0.01, + "learning_rate": 4.545454545454545e-08, + "logits/chosen": -1.4787240028381348, + "logits/rejected": -1.6369270086288452, + "logps/chosen": -55.18429183959961, + "logps/rejected": -50.92931365966797, + "loss": 0.6947, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.0007439616019837558, + "rewards/margins": -0.0010132314637303352, + "rewards/rejected": 0.0002692697453312576, + "step": 6 + }, + { + "epoch": 0.01, + "learning_rate": 5.303030303030303e-08, + "logits/chosen": -1.3732951879501343, + "logits/rejected": -1.3683916330337524, + "logps/chosen": -75.06265258789062, + "logps/rejected": -93.20909118652344, + "loss": 0.6926, + "rewards/accuracies": 0.5625, + "rewards/chosen": 2.2769207134842873e-05, + "rewards/margins": 0.0028623221442103386, + "rewards/rejected": -0.0028395531699061394, + "step": 7 + }, + { + "epoch": 0.01, + "learning_rate": 6.060606060606061e-08, + "logits/chosen": -1.5228297710418701, + "logits/rejected": -1.6090095043182373, + "logps/chosen": -63.288326263427734, + "logps/rejected": -85.08869171142578, + "loss": 0.6934, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.004945838823914528, + "rewards/margins": -6.294244667515159e-05, + "rewards/rejected": -0.004882895387709141, + "step": 8 + }, + { + "epoch": 0.01, + "learning_rate": 6.818181818181817e-08, + "logits/chosen": -1.41131591796875, + "logits/rejected": -1.5036777257919312, + "logps/chosen": -66.2701644897461, + "logps/rejected": -61.75237274169922, + "loss": 0.6937, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.009852385148406029, + "rewards/margins": 0.008082007989287376, + "rewards/rejected": 0.0017703771591186523, + "step": 9 + }, + { + "epoch": 0.02, + "learning_rate": 7.575757575757576e-08, + "logits/chosen": -1.3245943784713745, + "logits/rejected": -1.297118067741394, + "logps/chosen": -62.03919219970703, + "logps/rejected": -86.58907318115234, + "loss": 0.6927, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0011534811928868294, + "rewards/margins": -0.004392742644995451, + "rewards/rejected": 0.005546224303543568, + "step": 10 + }, + { + "epoch": 0.02, + "learning_rate": 8.333333333333333e-08, + "logits/chosen": -1.374742031097412, + "logits/rejected": -1.3923227787017822, + "logps/chosen": -63.650299072265625, + "logps/rejected": -75.1951904296875, + "loss": 0.6913, + "rewards/accuracies": 0.375, + "rewards/chosen": 1.7667189240455627e-05, + "rewards/margins": -0.0065226079896092415, + "rewards/rejected": 0.0065402742475271225, + "step": 11 + }, + { + "epoch": 0.02, + "learning_rate": 9.09090909090909e-08, + "logits/chosen": -1.3470630645751953, + "logits/rejected": -1.3567157983779907, + "logps/chosen": -58.94744873046875, + "logps/rejected": -63.96583557128906, + "loss": 0.6928, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.005938088521361351, + "rewards/margins": -0.006357109639793634, + "rewards/rejected": 0.00041902053635567427, + "step": 12 + }, + { + "epoch": 0.02, + "learning_rate": 9.848484848484848e-08, + "logits/chosen": -1.3704334497451782, + "logits/rejected": -1.4031784534454346, + "logps/chosen": -64.4774169921875, + "logps/rejected": -74.84626770019531, + "loss": 0.6925, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.0072515010833740234, + "rewards/margins": -0.0038907527923583984, + "rewards/rejected": -0.0033607487566769123, + "step": 13 + }, + { + "epoch": 0.02, + "learning_rate": 1.0606060606060605e-07, + "logits/chosen": -1.2817316055297852, + "logits/rejected": -1.2899329662322998, + "logps/chosen": -76.54801177978516, + "logps/rejected": -71.5182113647461, + "loss": 0.6941, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.005866062827408314, + "rewards/margins": -0.009477593004703522, + "rewards/rejected": 0.0036115292459726334, + "step": 14 + }, + { + "epoch": 0.02, + "learning_rate": 1.1363636363636363e-07, + "logits/chosen": -1.212456226348877, + "logits/rejected": -1.2076374292373657, + "logps/chosen": -44.21965789794922, + "logps/rejected": -67.9637680053711, + "loss": 0.6938, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.0028303861618041992, + "rewards/margins": -0.004396808333694935, + "rewards/rejected": 0.007227194495499134, + "step": 15 + }, + { + "epoch": 0.02, + "learning_rate": 1.2121212121212122e-07, + "logits/chosen": -1.4582772254943848, + "logits/rejected": -1.4182686805725098, + "logps/chosen": -45.72659683227539, + "logps/rejected": -59.42350769042969, + "loss": 0.6925, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.0005197167047299445, + "rewards/margins": -0.0064195385202765465, + "rewards/rejected": 0.0058998228050768375, + "step": 16 + }, + { + "epoch": 0.03, + "learning_rate": 1.2878787878787877e-07, + "logits/chosen": -1.4755264520645142, + "logits/rejected": -1.50910222530365, + "logps/chosen": -58.174442291259766, + "logps/rejected": -64.99958801269531, + "loss": 0.6939, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.0004933952586725354, + "rewards/margins": -0.0005321386270225048, + "rewards/rejected": 0.0010255335364490747, + "step": 17 + }, + { + "epoch": 0.03, + "learning_rate": 1.3636363636363635e-07, + "logits/chosen": -1.495653510093689, + "logits/rejected": -1.5463732481002808, + "logps/chosen": -47.88037872314453, + "logps/rejected": -57.69328308105469, + "loss": 0.6927, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0021576883736997843, + "rewards/margins": -0.004288875963538885, + "rewards/rejected": 0.006446564570069313, + "step": 18 + }, + { + "epoch": 0.03, + "learning_rate": 1.4393939393939395e-07, + "logits/chosen": -1.1813757419586182, + "logits/rejected": -1.181291103363037, + "logps/chosen": -45.04245376586914, + "logps/rejected": -59.69392776489258, + "loss": 0.6924, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0029491903260350227, + "rewards/margins": 0.003839898156002164, + "rewards/rejected": -0.0008907080627977848, + "step": 19 + }, + { + "epoch": 0.03, + "learning_rate": 1.5151515151515152e-07, + "logits/chosen": -1.2665643692016602, + "logits/rejected": -1.3037891387939453, + "logps/chosen": -55.89977264404297, + "logps/rejected": -67.01914978027344, + "loss": 0.6908, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.01057122927159071, + "rewards/margins": 0.008913875557482243, + "rewards/rejected": 0.0016573548782616854, + "step": 20 + }, + { + "epoch": 0.03, + "learning_rate": 1.5909090909090907e-07, + "logits/chosen": -1.468496322631836, + "logits/rejected": -1.4428430795669556, + "logps/chosen": -60.451072692871094, + "logps/rejected": -83.61201477050781, + "loss": 0.6912, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.01200780924409628, + "rewards/margins": 0.004002022091299295, + "rewards/rejected": 0.008005785755813122, + "step": 21 + }, + { + "epoch": 0.03, + "learning_rate": 1.6666666666666665e-07, + "logits/chosen": -1.2354681491851807, + "logits/rejected": -1.279544711112976, + "logps/chosen": -64.74099731445312, + "logps/rejected": -64.46685791015625, + "loss": 0.6939, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.0025492431595921516, + "rewards/margins": 9.844335727393627e-05, + "rewards/rejected": 0.002450800035148859, + "step": 22 + }, + { + "epoch": 0.03, + "learning_rate": 1.7424242424242425e-07, + "logits/chosen": -1.4172277450561523, + "logits/rejected": -1.4344230890274048, + "logps/chosen": -65.05791473388672, + "logps/rejected": -69.5328140258789, + "loss": 0.6918, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.004820013418793678, + "rewards/margins": 0.0006602166686207056, + "rewards/rejected": 0.004159796051681042, + "step": 23 + }, + { + "epoch": 0.04, + "learning_rate": 1.818181818181818e-07, + "logits/chosen": -1.2062236070632935, + "logits/rejected": -1.2450206279754639, + "logps/chosen": -50.588783264160156, + "logps/rejected": -53.00873565673828, + "loss": 0.6918, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.000991773558780551, + "rewards/margins": 0.0025939582847058773, + "rewards/rejected": -0.0016021848423406482, + "step": 24 + }, + { + "epoch": 0.04, + "learning_rate": 1.8939393939393938e-07, + "logits/chosen": -1.419810175895691, + "logits/rejected": -1.4961423873901367, + "logps/chosen": -64.14460754394531, + "logps/rejected": -82.07677459716797, + "loss": 0.692, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.011395109817385674, + "rewards/margins": 0.0041154976934194565, + "rewards/rejected": 0.007279610726982355, + "step": 25 + }, + { + "epoch": 0.04, + "learning_rate": 1.9696969696969696e-07, + "logits/chosen": -1.3785347938537598, + "logits/rejected": -1.447749376296997, + "logps/chosen": -55.51921844482422, + "logps/rejected": -60.01416778564453, + "loss": 0.6921, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.012294232845306396, + "rewards/margins": 0.0012017728295177221, + "rewards/rejected": 0.01109245978295803, + "step": 26 + }, + { + "epoch": 0.04, + "learning_rate": 2.0454545454545456e-07, + "logits/chosen": -1.3943463563919067, + "logits/rejected": -1.5082389116287231, + "logps/chosen": -63.7392578125, + "logps/rejected": -64.67830657958984, + "loss": 0.6895, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.015080487355589867, + "rewards/margins": 0.004277646541595459, + "rewards/rejected": 0.010802840813994408, + "step": 27 + }, + { + "epoch": 0.04, + "learning_rate": 2.121212121212121e-07, + "logits/chosen": -1.3473079204559326, + "logits/rejected": -1.3881864547729492, + "logps/chosen": -62.74684143066406, + "logps/rejected": -65.70709228515625, + "loss": 0.6891, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.01610391214489937, + "rewards/margins": 0.006725156679749489, + "rewards/rejected": 0.00937875546514988, + "step": 28 + }, + { + "epoch": 0.04, + "learning_rate": 2.1969696969696968e-07, + "logits/chosen": -1.2148323059082031, + "logits/rejected": -1.2324485778808594, + "logps/chosen": -52.017539978027344, + "logps/rejected": -69.51564025878906, + "loss": 0.6887, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.011382735334336758, + "rewards/margins": -0.00423963088542223, + "rewards/rejected": 0.015622366219758987, + "step": 29 + }, + { + "epoch": 0.05, + "learning_rate": 2.2727272727272726e-07, + "logits/chosen": -1.3029567003250122, + "logits/rejected": -1.3896591663360596, + "logps/chosen": -63.54433059692383, + "logps/rejected": -73.599365234375, + "loss": 0.6866, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.030646946281194687, + "rewards/margins": 0.020649565383791924, + "rewards/rejected": 0.009997379966080189, + "step": 30 + }, + { + "epoch": 0.05, + "learning_rate": 2.3484848484848486e-07, + "logits/chosen": -1.0819051265716553, + "logits/rejected": -1.1202231645584106, + "logps/chosen": -49.7504997253418, + "logps/rejected": -48.87813949584961, + "loss": 0.6898, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.003466451307758689, + "rewards/margins": 0.0005712390411645174, + "rewards/rejected": 0.002895212033763528, + "step": 31 + }, + { + "epoch": 0.05, + "learning_rate": 2.4242424242424244e-07, + "logits/chosen": -1.436554193496704, + "logits/rejected": -1.4304763078689575, + "logps/chosen": -66.21983337402344, + "logps/rejected": -80.84318542480469, + "loss": 0.686, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.04641828313469887, + "rewards/margins": 0.012152241542935371, + "rewards/rejected": 0.034266043454408646, + "step": 32 + }, + { + "epoch": 0.05, + "learning_rate": 2.5e-07, + "logits/chosen": -1.416285753250122, + "logits/rejected": -1.388656497001648, + "logps/chosen": -47.95892333984375, + "logps/rejected": -57.33323287963867, + "loss": 0.6873, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.021166132763028145, + "rewards/margins": 0.012613797560334206, + "rewards/rejected": 0.008552337065339088, + "step": 33 + }, + { + "epoch": 0.05, + "learning_rate": 2.5757575757575754e-07, + "logits/chosen": -1.4820764064788818, + "logits/rejected": -1.4750490188598633, + "logps/chosen": -57.51579284667969, + "logps/rejected": -61.07271194458008, + "loss": 0.6807, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.04040107876062393, + "rewards/margins": 0.02762734889984131, + "rewards/rejected": 0.012773728929460049, + "step": 34 + }, + { + "epoch": 0.05, + "learning_rate": 2.6515151515151514e-07, + "logits/chosen": -1.3491145372390747, + "logits/rejected": -1.3379892110824585, + "logps/chosen": -54.85727310180664, + "logps/rejected": -59.90010070800781, + "loss": 0.6894, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.030424263328313828, + "rewards/margins": 0.008832884952425957, + "rewards/rejected": 0.02159137651324272, + "step": 35 + }, + { + "epoch": 0.05, + "learning_rate": 2.727272727272727e-07, + "logits/chosen": -1.4148902893066406, + "logits/rejected": -1.3810207843780518, + "logps/chosen": -45.91991424560547, + "logps/rejected": -46.03016662597656, + "loss": 0.6874, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.03214998543262482, + "rewards/margins": 0.01864071935415268, + "rewards/rejected": 0.013509261421859264, + "step": 36 + }, + { + "epoch": 0.06, + "learning_rate": 2.8030303030303024e-07, + "logits/chosen": -1.4646170139312744, + "logits/rejected": -1.4692356586456299, + "logps/chosen": -61.599639892578125, + "logps/rejected": -69.2748794555664, + "loss": 0.6861, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.05117490142583847, + "rewards/margins": 0.03509356081485748, + "rewards/rejected": 0.016081344336271286, + "step": 37 + }, + { + "epoch": 0.06, + "learning_rate": 2.878787878787879e-07, + "logits/chosen": -1.5223915576934814, + "logits/rejected": -1.633446216583252, + "logps/chosen": -54.41788101196289, + "logps/rejected": -71.25310516357422, + "loss": 0.6828, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.054975785315036774, + "rewards/margins": 0.015454876236617565, + "rewards/rejected": 0.03952091187238693, + "step": 38 + }, + { + "epoch": 0.06, + "learning_rate": 2.9545454545454545e-07, + "logits/chosen": -1.29225754737854, + "logits/rejected": -1.3325889110565186, + "logps/chosen": -60.84377670288086, + "logps/rejected": -72.92223358154297, + "loss": 0.6801, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.06242671236395836, + "rewards/margins": 0.02597186714410782, + "rewards/rejected": 0.03645484521985054, + "step": 39 + }, + { + "epoch": 0.06, + "learning_rate": 3.0303030303030305e-07, + "logits/chosen": -1.478752851486206, + "logits/rejected": -1.4469019174575806, + "logps/chosen": -58.46867370605469, + "logps/rejected": -72.66609191894531, + "loss": 0.677, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.08628889918327332, + "rewards/margins": 0.0161750428378582, + "rewards/rejected": 0.07011385262012482, + "step": 40 + }, + { + "epoch": 0.06, + "learning_rate": 3.106060606060606e-07, + "logits/chosen": -1.3904973268508911, + "logits/rejected": -1.4015754461288452, + "logps/chosen": -54.29767990112305, + "logps/rejected": -68.59740447998047, + "loss": 0.6778, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0754782184958458, + "rewards/margins": 0.036283429712057114, + "rewards/rejected": 0.03919479250907898, + "step": 41 + }, + { + "epoch": 0.06, + "learning_rate": 3.1818181818181815e-07, + "logits/chosen": -1.5571492910385132, + "logits/rejected": -1.5671147108078003, + "logps/chosen": -55.728492736816406, + "logps/rejected": -67.99396514892578, + "loss": 0.6832, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.06205673888325691, + "rewards/margins": -0.0022810909431427717, + "rewards/rejected": 0.0643378272652626, + "step": 42 + }, + { + "epoch": 0.07, + "learning_rate": 3.2575757575757575e-07, + "logits/chosen": -1.3594117164611816, + "logits/rejected": -1.3907921314239502, + "logps/chosen": -58.898040771484375, + "logps/rejected": -68.76325988769531, + "loss": 0.6672, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.07264361530542374, + "rewards/margins": 0.025914786383509636, + "rewards/rejected": 0.04672882333397865, + "step": 43 + }, + { + "epoch": 0.07, + "learning_rate": 3.333333333333333e-07, + "logits/chosen": -1.4709270000457764, + "logits/rejected": -1.5028572082519531, + "logps/chosen": -62.69089889526367, + "logps/rejected": -77.4356918334961, + "loss": 0.6712, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.08955918252468109, + "rewards/margins": 0.03507918864488602, + "rewards/rejected": 0.054479993879795074, + "step": 44 + }, + { + "epoch": 0.07, + "learning_rate": 3.4090909090909085e-07, + "logits/chosen": -1.2753313779830933, + "logits/rejected": -1.3156425952911377, + "logps/chosen": -49.9275016784668, + "logps/rejected": -63.02009582519531, + "loss": 0.6723, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.07214108854532242, + "rewards/margins": 0.01590968668460846, + "rewards/rejected": 0.05623140186071396, + "step": 45 + }, + { + "epoch": 0.07, + "learning_rate": 3.484848484848485e-07, + "logits/chosen": -1.5745067596435547, + "logits/rejected": -1.5774452686309814, + "logps/chosen": -67.59754943847656, + "logps/rejected": -87.30855560302734, + "loss": 0.6665, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.12661635875701904, + "rewards/margins": 0.0753166526556015, + "rewards/rejected": 0.051299698650836945, + "step": 46 + }, + { + "epoch": 0.07, + "learning_rate": 3.5606060606060606e-07, + "logits/chosen": -1.285461664199829, + "logits/rejected": -1.3261513710021973, + "logps/chosen": -44.92738342285156, + "logps/rejected": -51.15544891357422, + "loss": 0.6695, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.08344868570566177, + "rewards/margins": 0.04822618141770363, + "rewards/rejected": 0.035222504287958145, + "step": 47 + }, + { + "epoch": 0.07, + "learning_rate": 3.636363636363636e-07, + "logits/chosen": -1.2779335975646973, + "logits/rejected": -1.413496494293213, + "logps/chosen": -55.0052490234375, + "logps/rejected": -67.77876281738281, + "loss": 0.6682, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.14483140408992767, + "rewards/margins": 0.09184764325618744, + "rewards/rejected": 0.052983760833740234, + "step": 48 + }, + { + "epoch": 0.07, + "learning_rate": 3.712121212121212e-07, + "logits/chosen": -1.3702609539031982, + "logits/rejected": -1.4781056642532349, + "logps/chosen": -77.20408630371094, + "logps/rejected": -74.00489807128906, + "loss": 0.6635, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.14278045296669006, + "rewards/margins": 0.061780668795108795, + "rewards/rejected": 0.08099978417158127, + "step": 49 + }, + { + "epoch": 0.08, + "learning_rate": 3.7878787878787876e-07, + "logits/chosen": -1.3604373931884766, + "logits/rejected": -1.378310203552246, + "logps/chosen": -58.75284194946289, + "logps/rejected": -82.84333038330078, + "loss": 0.6589, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.19303418695926666, + "rewards/margins": 0.09257876873016357, + "rewards/rejected": 0.10045541822910309, + "step": 50 + }, + { + "epoch": 0.08, + "learning_rate": 3.8636363636363636e-07, + "logits/chosen": -1.3076860904693604, + "logits/rejected": -1.4157476425170898, + "logps/chosen": -47.252445220947266, + "logps/rejected": -48.10454559326172, + "loss": 0.6674, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.11059942096471786, + "rewards/margins": 0.06811001151800156, + "rewards/rejected": 0.04248940199613571, + "step": 51 + }, + { + "epoch": 0.08, + "learning_rate": 3.939393939393939e-07, + "logits/chosen": -1.503462791442871, + "logits/rejected": -1.5425465106964111, + "logps/chosen": -60.388336181640625, + "logps/rejected": -70.0484619140625, + "loss": 0.6614, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.16037140786647797, + "rewards/margins": 0.019282342866063118, + "rewards/rejected": 0.1410890817642212, + "step": 52 + }, + { + "epoch": 0.08, + "learning_rate": 4.0151515151515146e-07, + "logits/chosen": -1.3380749225616455, + "logits/rejected": -1.3209658861160278, + "logps/chosen": -52.98939514160156, + "logps/rejected": -68.4948501586914, + "loss": 0.6593, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.19923309981822968, + "rewards/margins": 0.01286761462688446, + "rewards/rejected": 0.18636548519134521, + "step": 53 + }, + { + "epoch": 0.08, + "learning_rate": 4.090909090909091e-07, + "logits/chosen": -1.282020092010498, + "logits/rejected": -1.3592822551727295, + "logps/chosen": -48.89101028442383, + "logps/rejected": -62.19300079345703, + "loss": 0.6654, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.17862869799137115, + "rewards/margins": 0.048022858798503876, + "rewards/rejected": 0.13060584664344788, + "step": 54 + }, + { + "epoch": 0.08, + "learning_rate": 4.1666666666666667e-07, + "logits/chosen": -1.3051087856292725, + "logits/rejected": -1.3205571174621582, + "logps/chosen": -59.4260368347168, + "logps/rejected": -80.87752532958984, + "loss": 0.6517, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.21322180330753326, + "rewards/margins": 0.056215398013591766, + "rewards/rejected": 0.1570064127445221, + "step": 55 + }, + { + "epoch": 0.09, + "learning_rate": 4.242424242424242e-07, + "logits/chosen": -1.3351942300796509, + "logits/rejected": -1.404450535774231, + "logps/chosen": -59.53591537475586, + "logps/rejected": -56.089290618896484, + "loss": 0.6681, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.12059932202100754, + "rewards/margins": 0.06928794831037521, + "rewards/rejected": 0.051311373710632324, + "step": 56 + }, + { + "epoch": 0.09, + "learning_rate": 4.318181818181818e-07, + "logits/chosen": -1.3024204969406128, + "logits/rejected": -1.389835238456726, + "logps/chosen": -53.57666778564453, + "logps/rejected": -51.41597366333008, + "loss": 0.6459, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.16042464971542358, + "rewards/margins": 0.10563376545906067, + "rewards/rejected": 0.05479089915752411, + "step": 57 + }, + { + "epoch": 0.09, + "learning_rate": 4.3939393939393937e-07, + "logits/chosen": -1.2929675579071045, + "logits/rejected": -1.3011419773101807, + "logps/chosen": -53.317405700683594, + "logps/rejected": -68.7337875366211, + "loss": 0.6455, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.1711544543504715, + "rewards/margins": 0.006437983829528093, + "rewards/rejected": 0.16471648216247559, + "step": 58 + }, + { + "epoch": 0.09, + "learning_rate": 4.469696969696969e-07, + "logits/chosen": -1.5048664808273315, + "logits/rejected": -1.5094908475875854, + "logps/chosen": -64.76486206054688, + "logps/rejected": -78.0581283569336, + "loss": 0.641, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.24148711562156677, + "rewards/margins": 0.10250819474458694, + "rewards/rejected": 0.13897892832756042, + "step": 59 + }, + { + "epoch": 0.09, + "learning_rate": 4.545454545454545e-07, + "logits/chosen": -1.3591467142105103, + "logits/rejected": -1.4223358631134033, + "logps/chosen": -52.74089813232422, + "logps/rejected": -76.95367431640625, + "loss": 0.642, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.2069980502128601, + "rewards/margins": 0.2083958089351654, + "rewards/rejected": -0.001397751271724701, + "step": 60 + }, + { + "epoch": 0.09, + "learning_rate": 4.6212121212121207e-07, + "logits/chosen": -1.4500656127929688, + "logits/rejected": -1.4579286575317383, + "logps/chosen": -61.141475677490234, + "logps/rejected": -67.77528381347656, + "loss": 0.6357, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1464468240737915, + "rewards/margins": 0.12135310471057892, + "rewards/rejected": 0.02509371004998684, + "step": 61 + }, + { + "epoch": 0.09, + "learning_rate": 4.696969696969697e-07, + "logits/chosen": -1.3917747735977173, + "logits/rejected": -1.3550325632095337, + "logps/chosen": -52.82176971435547, + "logps/rejected": -73.30123138427734, + "loss": 0.6488, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.14380736649036407, + "rewards/margins": 0.0667564794421196, + "rewards/rejected": 0.07705089449882507, + "step": 62 + }, + { + "epoch": 0.1, + "learning_rate": 4.772727272727273e-07, + "logits/chosen": -1.5208649635314941, + "logits/rejected": -1.4892252683639526, + "logps/chosen": -55.34352111816406, + "logps/rejected": -67.55403137207031, + "loss": 0.6419, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.27572011947631836, + "rewards/margins": 0.14066900312900543, + "rewards/rejected": 0.13505114614963531, + "step": 63 + }, + { + "epoch": 0.1, + "learning_rate": 4.848484848484849e-07, + "logits/chosen": -1.4660046100616455, + "logits/rejected": -1.6109917163848877, + "logps/chosen": -62.14973449707031, + "logps/rejected": -81.52274322509766, + "loss": 0.6379, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.20028699934482574, + "rewards/margins": 0.19850493967533112, + "rewards/rejected": 0.001782064326107502, + "step": 64 + }, + { + "epoch": 0.1, + "learning_rate": 4.924242424242424e-07, + "logits/chosen": -1.290839672088623, + "logits/rejected": -1.2918977737426758, + "logps/chosen": -62.30453872680664, + "logps/rejected": -71.33212280273438, + "loss": 0.6337, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.1621263325214386, + "rewards/margins": 0.04839283600449562, + "rewards/rejected": 0.11373350024223328, + "step": 65 + }, + { + "epoch": 0.1, + "learning_rate": 5e-07, + "logits/chosen": -1.2274879217147827, + "logits/rejected": -1.2406153678894043, + "logps/chosen": -57.76105499267578, + "logps/rejected": -83.968994140625, + "loss": 0.6177, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.23292019963264465, + "rewards/margins": 0.2149038016796112, + "rewards/rejected": 0.018016403540968895, + "step": 66 + }, + { + "epoch": 0.1, + "learning_rate": 5.075757575757576e-07, + "logits/chosen": -1.1510586738586426, + "logits/rejected": -1.186632513999939, + "logps/chosen": -54.71442413330078, + "logps/rejected": -70.42817687988281, + "loss": 0.6217, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.09175221621990204, + "rewards/margins": -0.007499314844608307, + "rewards/rejected": 0.09925152361392975, + "step": 67 + }, + { + "epoch": 0.1, + "learning_rate": 5.151515151515151e-07, + "logits/chosen": -1.2038687467575073, + "logits/rejected": -1.2474325895309448, + "logps/chosen": -61.707618713378906, + "logps/rejected": -67.5180892944336, + "loss": 0.6296, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.11510595679283142, + "rewards/margins": 0.09858663380146027, + "rewards/rejected": 0.016519328579306602, + "step": 68 + }, + { + "epoch": 0.1, + "learning_rate": 5.227272727272727e-07, + "logits/chosen": -1.5192761421203613, + "logits/rejected": -1.531144618988037, + "logps/chosen": -60.95773696899414, + "logps/rejected": -63.483619689941406, + "loss": 0.6137, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.1562405526638031, + "rewards/margins": 0.13872182369232178, + "rewards/rejected": 0.01751871407032013, + "step": 69 + }, + { + "epoch": 0.11, + "learning_rate": 5.303030303030303e-07, + "logits/chosen": -1.46200430393219, + "logits/rejected": -1.4206883907318115, + "logps/chosen": -53.62306594848633, + "logps/rejected": -73.84425354003906, + "loss": 0.6008, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.08211082965135574, + "rewards/margins": 0.17571085691452026, + "rewards/rejected": -0.09360002726316452, + "step": 70 + }, + { + "epoch": 0.11, + "learning_rate": 5.378787878787878e-07, + "logits/chosen": -1.3949741125106812, + "logits/rejected": -1.3646326065063477, + "logps/chosen": -50.04539108276367, + "logps/rejected": -71.26206970214844, + "loss": 0.6139, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.14944177865982056, + "rewards/margins": 0.17371398210525513, + "rewards/rejected": -0.02427222579717636, + "step": 71 + }, + { + "epoch": 0.11, + "learning_rate": 5.454545454545454e-07, + "logits/chosen": -1.2625566720962524, + "logits/rejected": -1.30435049533844, + "logps/chosen": -42.66155242919922, + "logps/rejected": -49.39188003540039, + "loss": 0.6212, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.04858655110001564, + "rewards/margins": 0.08578862994909286, + "rewards/rejected": -0.03720208257436752, + "step": 72 + }, + { + "epoch": 0.11, + "learning_rate": 5.53030303030303e-07, + "logits/chosen": -1.5173357725143433, + "logits/rejected": -1.532628059387207, + "logps/chosen": -67.30522918701172, + "logps/rejected": -71.20864868164062, + "loss": 0.592, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.016212619841098785, + "rewards/margins": 0.17678993940353394, + "rewards/rejected": -0.16057732701301575, + "step": 73 + }, + { + "epoch": 0.11, + "learning_rate": 5.606060606060605e-07, + "logits/chosen": -1.314016342163086, + "logits/rejected": -1.355907917022705, + "logps/chosen": -77.79779815673828, + "logps/rejected": -84.5552978515625, + "loss": 0.5903, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.0052999164909124374, + "rewards/margins": 0.1510595828294754, + "rewards/rejected": -0.1563594937324524, + "step": 74 + }, + { + "epoch": 0.11, + "learning_rate": 5.681818181818182e-07, + "logits/chosen": -1.2979378700256348, + "logits/rejected": -1.4032480716705322, + "logps/chosen": -54.55912780761719, + "logps/rejected": -66.57144165039062, + "loss": 0.6116, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.04821896553039551, + "rewards/margins": 0.2483121156692505, + "rewards/rejected": -0.2000931352376938, + "step": 75 + }, + { + "epoch": 0.12, + "learning_rate": 5.757575757575758e-07, + "logits/chosen": -1.4066622257232666, + "logits/rejected": -1.4482512474060059, + "logps/chosen": -60.11941909790039, + "logps/rejected": -67.46295928955078, + "loss": 0.6201, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.03395650535821915, + "rewards/margins": 0.29136669635772705, + "rewards/rejected": -0.2574101984500885, + "step": 76 + }, + { + "epoch": 0.12, + "learning_rate": 5.833333333333334e-07, + "logits/chosen": -1.3306366205215454, + "logits/rejected": -1.3925280570983887, + "logps/chosen": -52.87839889526367, + "logps/rejected": -55.65316390991211, + "loss": 0.6075, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.050175078213214874, + "rewards/margins": 0.07563228905200958, + "rewards/rejected": -0.12580737471580505, + "step": 77 + }, + { + "epoch": 0.12, + "learning_rate": 5.909090909090909e-07, + "logits/chosen": -1.249314546585083, + "logits/rejected": -1.247490644454956, + "logps/chosen": -54.777767181396484, + "logps/rejected": -61.90538024902344, + "loss": 0.5993, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.041874658316373825, + "rewards/margins": 0.25838491320610046, + "rewards/rejected": -0.21651026606559753, + "step": 78 + }, + { + "epoch": 0.12, + "learning_rate": 5.984848484848485e-07, + "logits/chosen": -1.4066355228424072, + "logits/rejected": -1.3875881433486938, + "logps/chosen": -55.322410583496094, + "logps/rejected": -82.0425033569336, + "loss": 0.5885, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.033765971660614014, + "rewards/margins": 0.2818932831287384, + "rewards/rejected": -0.3156592845916748, + "step": 79 + }, + { + "epoch": 0.12, + "learning_rate": 6.060606060606061e-07, + "logits/chosen": -1.3568596839904785, + "logits/rejected": -1.355711579322815, + "logps/chosen": -63.79819869995117, + "logps/rejected": -87.84752655029297, + "loss": 0.5669, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0628647729754448, + "rewards/margins": 0.43628817796707153, + "rewards/rejected": -0.49915292859077454, + "step": 80 + }, + { + "epoch": 0.12, + "learning_rate": 6.136363636363636e-07, + "logits/chosen": -1.2920787334442139, + "logits/rejected": -1.2909367084503174, + "logps/chosen": -72.8036880493164, + "logps/rejected": -84.17581176757812, + "loss": 0.5831, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.06573189795017242, + "rewards/margins": 0.2303919941186905, + "rewards/rejected": -0.2961239218711853, + "step": 81 + }, + { + "epoch": 0.12, + "learning_rate": 6.212121212121212e-07, + "logits/chosen": -1.2058041095733643, + "logits/rejected": -1.2265359163284302, + "logps/chosen": -54.95501708984375, + "logps/rejected": -68.9476318359375, + "loss": 0.5531, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.1221294105052948, + "rewards/margins": 0.22339659929275513, + "rewards/rejected": -0.3455260097980499, + "step": 82 + }, + { + "epoch": 0.13, + "learning_rate": 6.287878787878788e-07, + "logits/chosen": -1.2530492544174194, + "logits/rejected": -1.2605494260787964, + "logps/chosen": -50.06599807739258, + "logps/rejected": -68.54713439941406, + "loss": 0.5978, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.0020802952349185944, + "rewards/margins": 0.3581036925315857, + "rewards/rejected": -0.3601840138435364, + "step": 83 + }, + { + "epoch": 0.13, + "learning_rate": 6.363636363636363e-07, + "logits/chosen": -1.2875895500183105, + "logits/rejected": -1.2904086112976074, + "logps/chosen": -63.59492111206055, + "logps/rejected": -92.8702163696289, + "loss": 0.5374, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.033722538501024246, + "rewards/margins": 0.7385894656181335, + "rewards/rejected": -0.7723120450973511, + "step": 84 + }, + { + "epoch": 0.13, + "learning_rate": 6.439393939393939e-07, + "logits/chosen": -1.319128155708313, + "logits/rejected": -1.2768744230270386, + "logps/chosen": -65.39422607421875, + "logps/rejected": -85.60679626464844, + "loss": 0.5433, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2017182856798172, + "rewards/margins": 0.40415698289871216, + "rewards/rejected": -0.6058752536773682, + "step": 85 + }, + { + "epoch": 0.13, + "learning_rate": 6.515151515151515e-07, + "logits/chosen": -1.2891089916229248, + "logits/rejected": -1.3245259523391724, + "logps/chosen": -70.21843719482422, + "logps/rejected": -87.76618194580078, + "loss": 0.598, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.12800543010234833, + "rewards/margins": 0.27882272005081177, + "rewards/rejected": -0.4068281352519989, + "step": 86 + }, + { + "epoch": 0.13, + "learning_rate": 6.59090909090909e-07, + "logits/chosen": -1.4151794910430908, + "logits/rejected": -1.401626467704773, + "logps/chosen": -59.638519287109375, + "logps/rejected": -68.63182830810547, + "loss": 0.5671, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.18449069559574127, + "rewards/margins": 0.26218655705451965, + "rewards/rejected": -0.4466772973537445, + "step": 87 + }, + { + "epoch": 0.13, + "learning_rate": 6.666666666666666e-07, + "logits/chosen": -1.4876341819763184, + "logits/rejected": -1.5497742891311646, + "logps/chosen": -53.505035400390625, + "logps/rejected": -62.61443328857422, + "loss": 0.5365, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.03885147348046303, + "rewards/margins": 0.4145755469799042, + "rewards/rejected": -0.37572407722473145, + "step": 88 + }, + { + "epoch": 0.14, + "learning_rate": 6.742424242424242e-07, + "logits/chosen": -1.4708367586135864, + "logits/rejected": -1.3771567344665527, + "logps/chosen": -61.228538513183594, + "logps/rejected": -93.87561798095703, + "loss": 0.545, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.0057802870869636536, + "rewards/margins": 0.9394412040710449, + "rewards/rejected": -0.9336608648300171, + "step": 89 + }, + { + "epoch": 0.14, + "learning_rate": 6.818181818181817e-07, + "logits/chosen": -1.2658331394195557, + "logits/rejected": -1.2438105344772339, + "logps/chosen": -64.95509338378906, + "logps/rejected": -86.05477905273438, + "loss": 0.5244, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.18022720515727997, + "rewards/margins": 0.41284486651420593, + "rewards/rejected": -0.5930720567703247, + "step": 90 + }, + { + "epoch": 0.14, + "learning_rate": 6.893939393939394e-07, + "logits/chosen": -1.3612754344940186, + "logits/rejected": -1.4404058456420898, + "logps/chosen": -59.00148391723633, + "logps/rejected": -73.94998931884766, + "loss": 0.5279, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.15143747627735138, + "rewards/margins": 0.48363757133483887, + "rewards/rejected": -0.635075032711029, + "step": 91 + }, + { + "epoch": 0.14, + "learning_rate": 6.96969696969697e-07, + "logits/chosen": -1.200008511543274, + "logits/rejected": -1.2430647611618042, + "logps/chosen": -67.3596420288086, + "logps/rejected": -78.98017883300781, + "loss": 0.5235, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.24770724773406982, + "rewards/margins": 0.5027315020561218, + "rewards/rejected": -0.750438928604126, + "step": 92 + }, + { + "epoch": 0.14, + "learning_rate": 7.045454545454545e-07, + "logits/chosen": -1.2663078308105469, + "logits/rejected": -1.3646279573440552, + "logps/chosen": -56.52938461303711, + "logps/rejected": -52.08761215209961, + "loss": 0.5377, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.09041959047317505, + "rewards/margins": 0.3694082498550415, + "rewards/rejected": -0.45982789993286133, + "step": 93 + }, + { + "epoch": 0.14, + "learning_rate": 7.121212121212121e-07, + "logits/chosen": -1.483896017074585, + "logits/rejected": -1.491857647895813, + "logps/chosen": -73.71580505371094, + "logps/rejected": -99.14268493652344, + "loss": 0.5347, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.40936753153800964, + "rewards/margins": 0.4666964113712311, + "rewards/rejected": -0.876063883304596, + "step": 94 + }, + { + "epoch": 0.14, + "learning_rate": 7.196969696969697e-07, + "logits/chosen": -1.2688690423965454, + "logits/rejected": -1.2628036737442017, + "logps/chosen": -54.340267181396484, + "logps/rejected": -68.0820083618164, + "loss": 0.5163, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.13473013043403625, + "rewards/margins": 0.5201348662376404, + "rewards/rejected": -0.654865026473999, + "step": 95 + }, + { + "epoch": 0.15, + "learning_rate": 7.272727272727272e-07, + "logits/chosen": -1.4142446517944336, + "logits/rejected": -1.478228211402893, + "logps/chosen": -52.192970275878906, + "logps/rejected": -60.232112884521484, + "loss": 0.5442, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.002001902088522911, + "rewards/margins": 0.3312196135520935, + "rewards/rejected": -0.32921773195266724, + "step": 96 + }, + { + "epoch": 0.15, + "learning_rate": 7.348484848484848e-07, + "logits/chosen": -1.3240199089050293, + "logits/rejected": -1.3259228467941284, + "logps/chosen": -55.829490661621094, + "logps/rejected": -55.56840896606445, + "loss": 0.5668, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.15988041460514069, + "rewards/margins": 0.2161427140235901, + "rewards/rejected": -0.3760231137275696, + "step": 97 + }, + { + "epoch": 0.15, + "learning_rate": 7.424242424242424e-07, + "logits/chosen": -1.319766879081726, + "logits/rejected": -1.4090697765350342, + "logps/chosen": -73.19944763183594, + "logps/rejected": -93.69942474365234, + "loss": 0.5177, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.215353861451149, + "rewards/margins": 0.9013051390647888, + "rewards/rejected": -1.1166590452194214, + "step": 98 + }, + { + "epoch": 0.15, + "learning_rate": 7.5e-07, + "logits/chosen": -1.4314652681350708, + "logits/rejected": -1.3855012655258179, + "logps/chosen": -48.45336151123047, + "logps/rejected": -68.33036041259766, + "loss": 0.5303, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.10106388479471207, + "rewards/margins": 0.4430334270000458, + "rewards/rejected": -0.5440973043441772, + "step": 99 + }, + { + "epoch": 0.15, + "learning_rate": 7.575757575757575e-07, + "logits/chosen": -1.2473658323287964, + "logits/rejected": -1.2665302753448486, + "logps/chosen": -55.45205307006836, + "logps/rejected": -78.337158203125, + "loss": 0.4852, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.011083535850048065, + "rewards/margins": 0.746026337146759, + "rewards/rejected": -0.7571098208427429, + "step": 100 + }, + { + "epoch": 0.15, + "learning_rate": 7.651515151515151e-07, + "logits/chosen": -1.236082911491394, + "logits/rejected": -1.2414617538452148, + "logps/chosen": -53.64671325683594, + "logps/rejected": -63.89655685424805, + "loss": 0.4884, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.08944252133369446, + "rewards/margins": 0.36017340421676636, + "rewards/rejected": -0.4496158957481384, + "step": 101 + }, + { + "epoch": 0.15, + "learning_rate": 7.727272727272727e-07, + "logits/chosen": -1.2842726707458496, + "logits/rejected": -1.2782001495361328, + "logps/chosen": -60.47846221923828, + "logps/rejected": -79.14984130859375, + "loss": 0.5645, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.32786887884140015, + "rewards/margins": 0.5022405982017517, + "rewards/rejected": -0.8301095366477966, + "step": 102 + }, + { + "epoch": 0.16, + "learning_rate": 7.803030303030302e-07, + "logits/chosen": -1.2620081901550293, + "logits/rejected": -1.2633405923843384, + "logps/chosen": -55.8270263671875, + "logps/rejected": -70.67143249511719, + "loss": 0.4981, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3648158013820648, + "rewards/margins": 0.2358836680650711, + "rewards/rejected": -0.6006994843482971, + "step": 103 + }, + { + "epoch": 0.16, + "learning_rate": 7.878787878787878e-07, + "logits/chosen": -1.277204990386963, + "logits/rejected": -1.1961885690689087, + "logps/chosen": -82.07122039794922, + "logps/rejected": -129.94085693359375, + "loss": 0.4846, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.860465407371521, + "rewards/margins": 1.5867725610733032, + "rewards/rejected": -2.447237730026245, + "step": 104 + }, + { + "epoch": 0.16, + "learning_rate": 7.954545454545454e-07, + "logits/chosen": -1.3882551193237305, + "logits/rejected": -1.36410391330719, + "logps/chosen": -65.95565795898438, + "logps/rejected": -90.56163787841797, + "loss": 0.5046, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6373096704483032, + "rewards/margins": 0.6799200177192688, + "rewards/rejected": -1.3172297477722168, + "step": 105 + }, + { + "epoch": 0.16, + "learning_rate": 8.030303030303029e-07, + "logits/chosen": -1.3937575817108154, + "logits/rejected": -1.3328791856765747, + "logps/chosen": -57.98670959472656, + "logps/rejected": -79.21855926513672, + "loss": 0.4929, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.134065181016922, + "rewards/margins": 0.8324288725852966, + "rewards/rejected": -0.966494083404541, + "step": 106 + }, + { + "epoch": 0.16, + "learning_rate": 8.106060606060605e-07, + "logits/chosen": -1.4467005729675293, + "logits/rejected": -1.4381572008132935, + "logps/chosen": -55.658119201660156, + "logps/rejected": -83.31585693359375, + "loss": 0.4194, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.19678352773189545, + "rewards/margins": 0.6790809035301208, + "rewards/rejected": -0.8758644461631775, + "step": 107 + }, + { + "epoch": 0.16, + "learning_rate": 8.181818181818182e-07, + "logits/chosen": -1.5325909852981567, + "logits/rejected": -1.5394513607025146, + "logps/chosen": -65.08736419677734, + "logps/rejected": -81.30486297607422, + "loss": 0.48, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.41833633184432983, + "rewards/margins": 0.7128567099571228, + "rewards/rejected": -1.1311931610107422, + "step": 108 + }, + { + "epoch": 0.17, + "learning_rate": 8.257575757575757e-07, + "logits/chosen": -1.427512526512146, + "logits/rejected": -1.4086472988128662, + "logps/chosen": -54.339439392089844, + "logps/rejected": -65.26164245605469, + "loss": 0.4509, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.36659955978393555, + "rewards/margins": 0.5181149840354919, + "rewards/rejected": -0.8847146034240723, + "step": 109 + }, + { + "epoch": 0.17, + "learning_rate": 8.333333333333333e-07, + "logits/chosen": -1.4420437812805176, + "logits/rejected": -1.508358120918274, + "logps/chosen": -61.45826721191406, + "logps/rejected": -67.23968505859375, + "loss": 0.5122, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.21704250574111938, + "rewards/margins": 0.7157589197158813, + "rewards/rejected": -0.9328014850616455, + "step": 110 + }, + { + "epoch": 0.17, + "learning_rate": 8.409090909090909e-07, + "logits/chosen": -1.3607494831085205, + "logits/rejected": -1.3761709928512573, + "logps/chosen": -69.81553649902344, + "logps/rejected": -89.93431091308594, + "loss": 0.5065, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.39086639881134033, + "rewards/margins": 0.9802790880203247, + "rewards/rejected": -1.3711453676223755, + "step": 111 + }, + { + "epoch": 0.17, + "learning_rate": 8.484848484848484e-07, + "logits/chosen": -1.4498722553253174, + "logits/rejected": -1.448058843612671, + "logps/chosen": -74.12974548339844, + "logps/rejected": -102.22052764892578, + "loss": 0.4436, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5492932796478271, + "rewards/margins": 1.652042031288147, + "rewards/rejected": -2.2013354301452637, + "step": 112 + }, + { + "epoch": 0.17, + "learning_rate": 8.56060606060606e-07, + "logits/chosen": -1.2922266721725464, + "logits/rejected": -1.374589204788208, + "logps/chosen": -63.6357421875, + "logps/rejected": -59.48933029174805, + "loss": 0.5046, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.13962438702583313, + "rewards/margins": 0.5601056814193726, + "rewards/rejected": -0.6997300982475281, + "step": 113 + }, + { + "epoch": 0.17, + "learning_rate": 8.636363636363636e-07, + "logits/chosen": -1.510749101638794, + "logits/rejected": -1.550467848777771, + "logps/chosen": -81.38545227050781, + "logps/rejected": -88.50811004638672, + "loss": 0.451, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6597554087638855, + "rewards/margins": 1.0560970306396484, + "rewards/rejected": -1.7158524990081787, + "step": 114 + }, + { + "epoch": 0.17, + "learning_rate": 8.712121212121211e-07, + "logits/chosen": -1.3277736902236938, + "logits/rejected": -1.3583914041519165, + "logps/chosen": -78.43363952636719, + "logps/rejected": -99.82984924316406, + "loss": 0.5054, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9456002712249756, + "rewards/margins": 1.1029608249664307, + "rewards/rejected": -2.0485610961914062, + "step": 115 + }, + { + "epoch": 0.18, + "learning_rate": 8.787878787878787e-07, + "logits/chosen": -1.2548108100891113, + "logits/rejected": -1.3178308010101318, + "logps/chosen": -62.914791107177734, + "logps/rejected": -77.83401489257812, + "loss": 0.4835, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.17691561579704285, + "rewards/margins": 1.2272669076919556, + "rewards/rejected": -1.4041826725006104, + "step": 116 + }, + { + "epoch": 0.18, + "learning_rate": 8.863636363636363e-07, + "logits/chosen": -1.2925139665603638, + "logits/rejected": -1.2939532995224, + "logps/chosen": -71.66761779785156, + "logps/rejected": -99.22976684570312, + "loss": 0.4363, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.5545389652252197, + "rewards/margins": 1.1594491004943848, + "rewards/rejected": -1.713987946510315, + "step": 117 + }, + { + "epoch": 0.18, + "learning_rate": 8.939393939393938e-07, + "logits/chosen": -1.3311680555343628, + "logits/rejected": -1.3106731176376343, + "logps/chosen": -86.64567565917969, + "logps/rejected": -128.3566436767578, + "loss": 0.4361, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0433001518249512, + "rewards/margins": 1.8462791442871094, + "rewards/rejected": -2.8895792961120605, + "step": 118 + }, + { + "epoch": 0.18, + "learning_rate": 9.015151515151514e-07, + "logits/chosen": -1.1761585474014282, + "logits/rejected": -1.1626079082489014, + "logps/chosen": -83.92056274414062, + "logps/rejected": -114.5244140625, + "loss": 0.4723, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.011547327041626, + "rewards/margins": 1.5983021259307861, + "rewards/rejected": -2.609849691390991, + "step": 119 + }, + { + "epoch": 0.18, + "learning_rate": 9.09090909090909e-07, + "logits/chosen": -1.3708627223968506, + "logits/rejected": -1.380934715270996, + "logps/chosen": -51.2702751159668, + "logps/rejected": -69.57611083984375, + "loss": 0.4129, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.15028327703475952, + "rewards/margins": 1.162183165550232, + "rewards/rejected": -1.0118999481201172, + "step": 120 + }, + { + "epoch": 0.18, + "learning_rate": 9.166666666666665e-07, + "logits/chosen": -1.3708436489105225, + "logits/rejected": -1.4203503131866455, + "logps/chosen": -84.83291625976562, + "logps/rejected": -139.04513549804688, + "loss": 0.3989, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1144614219665527, + "rewards/margins": 2.0489420890808105, + "rewards/rejected": -3.1634035110473633, + "step": 121 + }, + { + "epoch": 0.19, + "learning_rate": 9.242424242424241e-07, + "logits/chosen": -1.3400905132293701, + "logits/rejected": -1.424918532371521, + "logps/chosen": -82.42213439941406, + "logps/rejected": -95.20804595947266, + "loss": 0.5044, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1995664834976196, + "rewards/margins": 1.1138718128204346, + "rewards/rejected": -2.3134384155273438, + "step": 122 + }, + { + "epoch": 0.19, + "learning_rate": 9.318181818181817e-07, + "logits/chosen": -1.2418166399002075, + "logits/rejected": -1.2201536893844604, + "logps/chosen": -46.085845947265625, + "logps/rejected": -55.85955810546875, + "loss": 0.4017, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.09070870280265808, + "rewards/margins": 0.5683165192604065, + "rewards/rejected": -0.477607786655426, + "step": 123 + }, + { + "epoch": 0.19, + "learning_rate": 9.393939393939395e-07, + "logits/chosen": -1.3297370672225952, + "logits/rejected": -1.3739547729492188, + "logps/chosen": -66.36666107177734, + "logps/rejected": -73.92923736572266, + "loss": 0.4433, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.41327372193336487, + "rewards/margins": 0.7713428735733032, + "rewards/rejected": -1.1846165657043457, + "step": 124 + }, + { + "epoch": 0.19, + "learning_rate": 9.46969696969697e-07, + "logits/chosen": -1.3880527019500732, + "logits/rejected": -1.376543402671814, + "logps/chosen": -65.552734375, + "logps/rejected": -87.94761657714844, + "loss": 0.4418, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.7102217674255371, + "rewards/margins": 1.1007181406021118, + "rewards/rejected": -1.810939908027649, + "step": 125 + }, + { + "epoch": 0.19, + "learning_rate": 9.545454545454546e-07, + "logits/chosen": -1.6728534698486328, + "logits/rejected": -1.6981301307678223, + "logps/chosen": -69.73593139648438, + "logps/rejected": -79.36700439453125, + "loss": 0.5261, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.649253785610199, + "rewards/margins": 0.627336859703064, + "rewards/rejected": -1.2765905857086182, + "step": 126 + }, + { + "epoch": 0.19, + "learning_rate": 9.62121212121212e-07, + "logits/chosen": -1.3037967681884766, + "logits/rejected": -1.2259622812271118, + "logps/chosen": -60.86170959472656, + "logps/rejected": -105.67593383789062, + "loss": 0.4194, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.348906934261322, + "rewards/margins": 2.006357431411743, + "rewards/rejected": -2.355264186859131, + "step": 127 + }, + { + "epoch": 0.19, + "learning_rate": 9.696969696969698e-07, + "logits/chosen": -1.259075403213501, + "logits/rejected": -1.2268812656402588, + "logps/chosen": -58.85130310058594, + "logps/rejected": -86.45140838623047, + "loss": 0.4037, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3437346816062927, + "rewards/margins": 1.4063847064971924, + "rewards/rejected": -1.7501193284988403, + "step": 128 + }, + { + "epoch": 0.2, + "learning_rate": 9.772727272727273e-07, + "logits/chosen": -1.3443701267242432, + "logits/rejected": -1.3386574983596802, + "logps/chosen": -57.95923614501953, + "logps/rejected": -102.79905700683594, + "loss": 0.3809, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.22556929290294647, + "rewards/margins": 2.10101580619812, + "rewards/rejected": -2.326585292816162, + "step": 129 + }, + { + "epoch": 0.2, + "learning_rate": 9.848484848484847e-07, + "logits/chosen": -1.279806137084961, + "logits/rejected": -1.2831989526748657, + "logps/chosen": -64.24227905273438, + "logps/rejected": -79.32403564453125, + "loss": 0.3785, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.04765484482049942, + "rewards/margins": 1.1074268817901611, + "rewards/rejected": -1.155081868171692, + "step": 130 + }, + { + "epoch": 0.2, + "learning_rate": 9.924242424242425e-07, + "logits/chosen": -1.3265643119812012, + "logits/rejected": -1.3697947263717651, + "logps/chosen": -94.21955871582031, + "logps/rejected": -132.9693603515625, + "loss": 0.3656, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6402519941329956, + "rewards/margins": 2.4263713359832764, + "rewards/rejected": -3.0666234493255615, + "step": 131 + }, + { + "epoch": 0.2, + "learning_rate": 1e-06, + "logits/chosen": -1.4270625114440918, + "logits/rejected": -1.464454174041748, + "logps/chosen": -71.64118194580078, + "logps/rejected": -91.30876159667969, + "loss": 0.4357, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.22757545113563538, + "rewards/margins": 1.7800164222717285, + "rewards/rejected": -2.007591962814331, + "step": 132 + }, + { + "epoch": 0.2, + "learning_rate": 9.999982399050598e-07, + "logits/chosen": -1.386523723602295, + "logits/rejected": -1.3936899900436401, + "logps/chosen": -49.9768180847168, + "logps/rejected": -69.42765045166016, + "loss": 0.4158, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.28144755959510803, + "rewards/margins": 1.190410852432251, + "rewards/rejected": -0.9089633822441101, + "step": 133 + }, + { + "epoch": 0.2, + "learning_rate": 9.999929596326304e-07, + "logits/chosen": -1.3149131536483765, + "logits/rejected": -1.3466674089431763, + "logps/chosen": -68.77780151367188, + "logps/rejected": -82.54281616210938, + "loss": 0.4099, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.05727508291602135, + "rewards/margins": 1.5080410242080688, + "rewards/rejected": -1.5653159618377686, + "step": 134 + }, + { + "epoch": 0.21, + "learning_rate": 9.999841592198874e-07, + "logits/chosen": -1.215659260749817, + "logits/rejected": -1.192206621170044, + "logps/chosen": -60.977264404296875, + "logps/rejected": -91.90313720703125, + "loss": 0.3569, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.03573242574930191, + "rewards/margins": 1.7384833097457886, + "rewards/rejected": -1.7027509212493896, + "step": 135 + }, + { + "epoch": 0.21, + "learning_rate": 9.99971838728789e-07, + "logits/chosen": -1.2210043668746948, + "logits/rejected": -1.244004249572754, + "logps/chosen": -73.441650390625, + "logps/rejected": -119.06057739257812, + "loss": 0.3778, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.45418426394462585, + "rewards/margins": 2.3686211109161377, + "rewards/rejected": -2.822805643081665, + "step": 136 + }, + { + "epoch": 0.21, + "learning_rate": 9.99955998246076e-07, + "logits/chosen": -1.4085693359375, + "logits/rejected": -1.4438612461090088, + "logps/chosen": -51.40415954589844, + "logps/rejected": -74.1273193359375, + "loss": 0.3579, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.07564006000757217, + "rewards/margins": 1.2299882173538208, + "rewards/rejected": -1.1543481349945068, + "step": 137 + }, + { + "epoch": 0.21, + "learning_rate": 9.99936637883271e-07, + "logits/chosen": -1.30047607421875, + "logits/rejected": -1.3149194717407227, + "logps/chosen": -62.247467041015625, + "logps/rejected": -81.31385803222656, + "loss": 0.3915, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.048665329813957214, + "rewards/margins": 1.2062422037124634, + "rewards/rejected": -1.1575767993927002, + "step": 138 + }, + { + "epoch": 0.21, + "learning_rate": 9.999137577766792e-07, + "logits/chosen": -1.3509010076522827, + "logits/rejected": -1.3184211254119873, + "logps/chosen": -56.17451095581055, + "logps/rejected": -91.11962890625, + "loss": 0.3801, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.08695222437381744, + "rewards/margins": 1.9988842010498047, + "rewards/rejected": -2.085836410522461, + "step": 139 + }, + { + "epoch": 0.21, + "learning_rate": 9.998873580873846e-07, + "logits/chosen": -1.379082202911377, + "logits/rejected": -1.3597187995910645, + "logps/chosen": -68.40926361083984, + "logps/rejected": -92.97391510009766, + "loss": 0.4229, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08268654346466064, + "rewards/margins": 0.9664369225502014, + "rewards/rejected": -1.0491235256195068, + "step": 140 + }, + { + "epoch": 0.21, + "learning_rate": 9.998574390012513e-07, + "logits/chosen": -1.411400556564331, + "logits/rejected": -1.4529073238372803, + "logps/chosen": -61.78650665283203, + "logps/rejected": -81.70208740234375, + "loss": 0.3474, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1299721598625183, + "rewards/margins": 1.5512878894805908, + "rewards/rejected": -1.6812599897384644, + "step": 141 + }, + { + "epoch": 0.22, + "learning_rate": 9.99824000728921e-07, + "logits/chosen": -1.2770847082138062, + "logits/rejected": -1.2510144710540771, + "logps/chosen": -39.36260986328125, + "logps/rejected": -66.36067199707031, + "loss": 0.3939, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.3309634327888489, + "rewards/margins": 1.3281357288360596, + "rewards/rejected": -0.9971722364425659, + "step": 142 + }, + { + "epoch": 0.22, + "learning_rate": 9.997870435058115e-07, + "logits/chosen": -1.340492606163025, + "logits/rejected": -1.3173463344573975, + "logps/chosen": -49.027099609375, + "logps/rejected": -73.99095153808594, + "loss": 0.3731, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4593149721622467, + "rewards/margins": 1.2830872535705566, + "rewards/rejected": -0.8237722516059875, + "step": 143 + }, + { + "epoch": 0.22, + "learning_rate": 9.997465675921162e-07, + "logits/chosen": -1.1999183893203735, + "logits/rejected": -1.1477478742599487, + "logps/chosen": -68.66267395019531, + "logps/rejected": -112.85301971435547, + "loss": 0.3905, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.46990731358528137, + "rewards/margins": 2.243889808654785, + "rewards/rejected": -2.713797092437744, + "step": 144 + }, + { + "epoch": 0.22, + "learning_rate": 9.997025732728006e-07, + "logits/chosen": -1.4426615238189697, + "logits/rejected": -1.471621036529541, + "logps/chosen": -81.21881103515625, + "logps/rejected": -91.69154357910156, + "loss": 0.4525, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.194870114326477, + "rewards/margins": 0.6878187656402588, + "rewards/rejected": -1.8826889991760254, + "step": 145 + }, + { + "epoch": 0.22, + "learning_rate": 9.996550608576013e-07, + "logits/chosen": -1.2865259647369385, + "logits/rejected": -1.3044980764389038, + "logps/chosen": -53.92206954956055, + "logps/rejected": -68.4139175415039, + "loss": 0.3055, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4023975133895874, + "rewards/margins": 1.186065673828125, + "rewards/rejected": -0.7836681604385376, + "step": 146 + }, + { + "epoch": 0.22, + "learning_rate": 9.996040306810242e-07, + "logits/chosen": -1.2497471570968628, + "logits/rejected": -1.3371587991714478, + "logps/chosen": -59.97561264038086, + "logps/rejected": -72.31210327148438, + "loss": 0.3395, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3099679946899414, + "rewards/margins": 1.4178704023361206, + "rewards/rejected": -1.1079025268554688, + "step": 147 + }, + { + "epoch": 0.22, + "learning_rate": 9.995494831023408e-07, + "logits/chosen": -1.3360226154327393, + "logits/rejected": -1.3658947944641113, + "logps/chosen": -55.79327392578125, + "logps/rejected": -78.41800689697266, + "loss": 0.3534, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.2654246985912323, + "rewards/margins": 1.3563873767852783, + "rewards/rejected": -1.0909627676010132, + "step": 148 + }, + { + "epoch": 0.23, + "learning_rate": 9.994914185055867e-07, + "logits/chosen": -1.3814332485198975, + "logits/rejected": -1.3996297121047974, + "logps/chosen": -65.52349853515625, + "logps/rejected": -92.29167175292969, + "loss": 0.372, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.14941172301769257, + "rewards/margins": 1.968096137046814, + "rewards/rejected": -1.8186845779418945, + "step": 149 + }, + { + "epoch": 0.23, + "learning_rate": 9.99429837299559e-07, + "logits/chosen": -1.2469031810760498, + "logits/rejected": -1.2378968000411987, + "logps/chosen": -55.9729118347168, + "logps/rejected": -81.20050811767578, + "loss": 0.3292, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.061600759625434875, + "rewards/margins": 1.8637723922729492, + "rewards/rejected": -1.9253731966018677, + "step": 150 + }, + { + "epoch": 0.23, + "learning_rate": 9.993647399178123e-07, + "logits/chosen": -1.1675657033920288, + "logits/rejected": -1.0918349027633667, + "logps/chosen": -59.07560348510742, + "logps/rejected": -112.66256713867188, + "loss": 0.368, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7189707159996033, + "rewards/margins": 2.3366379737854004, + "rewards/rejected": -3.0556085109710693, + "step": 151 + }, + { + "epoch": 0.23, + "learning_rate": 9.992961268186572e-07, + "logits/chosen": -1.3458789587020874, + "logits/rejected": -1.322076439857483, + "logps/chosen": -50.53932571411133, + "logps/rejected": -84.67821502685547, + "loss": 0.3689, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.14475339651107788, + "rewards/margins": 1.5784586668014526, + "rewards/rejected": -1.43370521068573, + "step": 152 + }, + { + "epoch": 0.23, + "learning_rate": 9.992239984851562e-07, + "logits/chosen": -1.5444601774215698, + "logits/rejected": -1.5224881172180176, + "logps/chosen": -65.60322570800781, + "logps/rejected": -96.27873229980469, + "loss": 0.3173, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.181745707988739, + "rewards/margins": 1.8834576606750488, + "rewards/rejected": -2.0652034282684326, + "step": 153 + }, + { + "epoch": 0.23, + "learning_rate": 9.9914835542512e-07, + "logits/chosen": -1.2116072177886963, + "logits/rejected": -1.2211272716522217, + "logps/chosen": -66.64263916015625, + "logps/rejected": -97.93750762939453, + "loss": 0.3044, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.26799696683883667, + "rewards/margins": 2.5864644050598145, + "rewards/rejected": -2.318467617034912, + "step": 154 + }, + { + "epoch": 0.24, + "learning_rate": 9.990691981711042e-07, + "logits/chosen": -1.2259119749069214, + "logits/rejected": -1.3609488010406494, + "logps/chosen": -55.120872497558594, + "logps/rejected": -86.47576141357422, + "loss": 0.3215, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.20785972476005554, + "rewards/margins": 2.307239532470703, + "rewards/rejected": -2.0993800163269043, + "step": 155 + }, + { + "epoch": 0.24, + "learning_rate": 9.989865272804063e-07, + "logits/chosen": -1.2843170166015625, + "logits/rejected": -1.2973507642745972, + "logps/chosen": -68.40337371826172, + "logps/rejected": -103.91558074951172, + "loss": 0.2876, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2823537588119507, + "rewards/margins": 2.520174026489258, + "rewards/rejected": -2.802527666091919, + "step": 156 + }, + { + "epoch": 0.24, + "learning_rate": 9.989003433350606e-07, + "logits/chosen": -1.3156423568725586, + "logits/rejected": -1.2813544273376465, + "logps/chosen": -49.433929443359375, + "logps/rejected": -66.11580657958984, + "loss": 0.4019, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.211796373128891, + "rewards/margins": 1.0712887048721313, + "rewards/rejected": -0.859492301940918, + "step": 157 + }, + { + "epoch": 0.24, + "learning_rate": 9.988106469418345e-07, + "logits/chosen": -1.396899700164795, + "logits/rejected": -1.3826349973678589, + "logps/chosen": -49.73061752319336, + "logps/rejected": -65.69740295410156, + "loss": 0.3342, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.2982715368270874, + "rewards/margins": 1.3624143600463867, + "rewards/rejected": -1.0641428232192993, + "step": 158 + }, + { + "epoch": 0.24, + "learning_rate": 9.98717438732225e-07, + "logits/chosen": -1.414467692375183, + "logits/rejected": -1.382720708847046, + "logps/chosen": -55.65098571777344, + "logps/rejected": -86.2964859008789, + "loss": 0.2899, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11677652597427368, + "rewards/margins": 2.016666889190674, + "rewards/rejected": -2.1334433555603027, + "step": 159 + }, + { + "epoch": 0.24, + "learning_rate": 9.986207193624536e-07, + "logits/chosen": -1.189499020576477, + "logits/rejected": -1.2207542657852173, + "logps/chosen": -56.323951721191406, + "logps/rejected": -104.06111145019531, + "loss": 0.2893, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.07249424606561661, + "rewards/margins": 2.821072816848755, + "rewards/rejected": -2.7485785484313965, + "step": 160 + }, + { + "epoch": 0.24, + "learning_rate": 9.985204895134607e-07, + "logits/chosen": -1.0789204835891724, + "logits/rejected": -1.0679121017456055, + "logps/chosen": -60.86206817626953, + "logps/rejected": -104.61509704589844, + "loss": 0.3352, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.19751030206680298, + "rewards/margins": 2.8937127590179443, + "rewards/rejected": -2.696202516555786, + "step": 161 + }, + { + "epoch": 0.25, + "learning_rate": 9.98416749890903e-07, + "logits/chosen": -1.1677380800247192, + "logits/rejected": -1.162474274635315, + "logps/chosen": -74.80203247070312, + "logps/rejected": -125.43745422363281, + "loss": 0.3155, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.34360015392303467, + "rewards/margins": 3.467597723007202, + "rewards/rejected": -3.8111977577209473, + "step": 162 + }, + { + "epoch": 0.25, + "learning_rate": 9.983095012251467e-07, + "logits/chosen": -1.2328966856002808, + "logits/rejected": -1.223034381866455, + "logps/chosen": -50.8116455078125, + "logps/rejected": -91.66698455810547, + "loss": 0.276, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.16697055101394653, + "rewards/margins": 2.390049695968628, + "rewards/rejected": -2.223079204559326, + "step": 163 + }, + { + "epoch": 0.25, + "learning_rate": 9.98198744271263e-07, + "logits/chosen": -1.2168715000152588, + "logits/rejected": -1.3405088186264038, + "logps/chosen": -74.32440185546875, + "logps/rejected": -107.49734497070312, + "loss": 0.3153, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.514243483543396, + "rewards/margins": 3.4297971725463867, + "rewards/rejected": -3.9440410137176514, + "step": 164 + }, + { + "epoch": 0.25, + "learning_rate": 9.980844798090233e-07, + "logits/chosen": -1.2657420635223389, + "logits/rejected": -1.2185579538345337, + "logps/chosen": -87.26468658447266, + "logps/rejected": -117.57492065429688, + "loss": 0.3192, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.79978346824646, + "rewards/margins": 2.257918357849121, + "rewards/rejected": -4.05770206451416, + "step": 165 + }, + { + "epoch": 0.25, + "learning_rate": 9.979667086428925e-07, + "logits/chosen": -1.0757566690444946, + "logits/rejected": -1.0354247093200684, + "logps/chosen": -57.32990646362305, + "logps/rejected": -102.35804748535156, + "loss": 0.3785, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5685438513755798, + "rewards/margins": 2.6311147212982178, + "rewards/rejected": -3.1996583938598633, + "step": 166 + }, + { + "epoch": 0.25, + "learning_rate": 9.978454316020244e-07, + "logits/chosen": -1.0912585258483887, + "logits/rejected": -1.1047651767730713, + "logps/chosen": -78.06692504882812, + "logps/rejected": -97.17657470703125, + "loss": 0.3359, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1147944927215576, + "rewards/margins": 1.6447203159332275, + "rewards/rejected": -2.759514808654785, + "step": 167 + }, + { + "epoch": 0.26, + "learning_rate": 9.977206495402552e-07, + "logits/chosen": -1.2980186939239502, + "logits/rejected": -1.243241786956787, + "logps/chosen": -57.57561111450195, + "logps/rejected": -105.88298034667969, + "loss": 0.2606, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.19986918568611145, + "rewards/margins": 2.2533702850341797, + "rewards/rejected": -2.453239679336548, + "step": 168 + }, + { + "epoch": 0.26, + "learning_rate": 9.975923633360984e-07, + "logits/chosen": -1.2510402202606201, + "logits/rejected": -1.2807258367538452, + "logps/chosen": -72.90382385253906, + "logps/rejected": -104.51939392089844, + "loss": 0.3472, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.4487581253051758, + "rewards/margins": 3.244266986846924, + "rewards/rejected": -3.6930251121520996, + "step": 169 + }, + { + "epoch": 0.26, + "learning_rate": 9.974605738927374e-07, + "logits/chosen": -1.0293492078781128, + "logits/rejected": -1.0240594148635864, + "logps/chosen": -54.199405670166016, + "logps/rejected": -85.39093780517578, + "loss": 0.3387, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.00886218249797821, + "rewards/margins": 2.4028754234313965, + "rewards/rejected": -2.3940131664276123, + "step": 170 + }, + { + "epoch": 0.26, + "learning_rate": 9.973252821380198e-07, + "logits/chosen": -1.195522665977478, + "logits/rejected": -1.2614532709121704, + "logps/chosen": -57.30488967895508, + "logps/rejected": -89.09839630126953, + "loss": 0.3472, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.08645845949649811, + "rewards/margins": 2.9136736392974854, + "rewards/rejected": -2.8272151947021484, + "step": 171 + }, + { + "epoch": 0.26, + "learning_rate": 9.971864890244513e-07, + "logits/chosen": -1.032387137413025, + "logits/rejected": -0.9830933809280396, + "logps/chosen": -81.36624145507812, + "logps/rejected": -131.3253936767578, + "loss": 0.276, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3707561492919922, + "rewards/margins": 2.856058359146118, + "rewards/rejected": -4.226814270019531, + "step": 172 + }, + { + "epoch": 0.26, + "learning_rate": 9.970441955291877e-07, + "logits/chosen": -1.1625866889953613, + "logits/rejected": -1.2023017406463623, + "logps/chosen": -70.9615707397461, + "logps/rejected": -101.77046966552734, + "loss": 0.2518, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.08144064992666245, + "rewards/margins": 2.5768473148345947, + "rewards/rejected": -2.65828800201416, + "step": 173 + }, + { + "epoch": 0.26, + "learning_rate": 9.968984026540296e-07, + "logits/chosen": -1.0575189590454102, + "logits/rejected": -1.1296477317810059, + "logps/chosen": -44.96015167236328, + "logps/rejected": -86.93184661865234, + "loss": 0.2624, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.8304497599601746, + "rewards/margins": 3.159740686416626, + "rewards/rejected": -2.3292911052703857, + "step": 174 + }, + { + "epoch": 0.27, + "learning_rate": 9.96749111425414e-07, + "logits/chosen": -1.0364952087402344, + "logits/rejected": -1.1027450561523438, + "logps/chosen": -69.6294937133789, + "logps/rejected": -111.92215728759766, + "loss": 0.2836, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2097761183977127, + "rewards/margins": 2.9586079120635986, + "rewards/rejected": -3.168384313583374, + "step": 175 + }, + { + "epoch": 0.27, + "learning_rate": 9.965963228944076e-07, + "logits/chosen": -1.1743711233139038, + "logits/rejected": -1.1658631563186646, + "logps/chosen": -57.486873626708984, + "logps/rejected": -104.13549041748047, + "loss": 0.2484, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3816438317298889, + "rewards/margins": 3.8634872436523438, + "rewards/rejected": -3.4818437099456787, + "step": 176 + }, + { + "epoch": 0.27, + "learning_rate": 9.964400381367002e-07, + "logits/chosen": -1.299519658088684, + "logits/rejected": -1.297895908355713, + "logps/chosen": -53.962806701660156, + "logps/rejected": -98.18833923339844, + "loss": 0.2612, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.29737111926078796, + "rewards/margins": 3.225770950317383, + "rewards/rejected": -2.9283993244171143, + "step": 177 + }, + { + "epoch": 0.27, + "learning_rate": 9.962802582525957e-07, + "logits/chosen": -1.2027112245559692, + "logits/rejected": -1.1979259252548218, + "logps/chosen": -52.533470153808594, + "logps/rejected": -99.93737030029297, + "loss": 0.3, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5650640726089478, + "rewards/margins": 3.342776298522949, + "rewards/rejected": -2.777712106704712, + "step": 178 + }, + { + "epoch": 0.27, + "learning_rate": 9.96116984367005e-07, + "logits/chosen": -1.3352652788162231, + "logits/rejected": -1.313057541847229, + "logps/chosen": -60.1168327331543, + "logps/rejected": -139.58270263671875, + "loss": 0.376, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.5508058667182922, + "rewards/margins": 4.521890163421631, + "rewards/rejected": -3.9710841178894043, + "step": 179 + }, + { + "epoch": 0.27, + "learning_rate": 9.959502176294382e-07, + "logits/chosen": -1.3684498071670532, + "logits/rejected": -1.4441702365875244, + "logps/chosen": -47.63343811035156, + "logps/rejected": -90.16297912597656, + "loss": 0.2907, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.8288148641586304, + "rewards/margins": 3.3544418811798096, + "rewards/rejected": -2.525627374649048, + "step": 180 + }, + { + "epoch": 0.27, + "learning_rate": 9.95779959213997e-07, + "logits/chosen": -1.2334184646606445, + "logits/rejected": -1.2272242307662964, + "logps/chosen": -55.64075469970703, + "logps/rejected": -104.79720306396484, + "loss": 0.2692, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.4517117738723755, + "rewards/margins": 3.394285202026367, + "rewards/rejected": -2.9425735473632812, + "step": 181 + }, + { + "epoch": 0.28, + "learning_rate": 9.956062103193646e-07, + "logits/chosen": -1.1894168853759766, + "logits/rejected": -1.1877219676971436, + "logps/chosen": -70.23963928222656, + "logps/rejected": -126.3914794921875, + "loss": 0.2532, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06568023562431335, + "rewards/margins": 3.421466588973999, + "rewards/rejected": -3.487147092819214, + "step": 182 + }, + { + "epoch": 0.28, + "learning_rate": 9.954289721687996e-07, + "logits/chosen": -0.8898589015007019, + "logits/rejected": -0.928949773311615, + "logps/chosen": -75.32648468017578, + "logps/rejected": -101.60359191894531, + "loss": 0.2379, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.34705063700675964, + "rewards/margins": 2.680817127227783, + "rewards/rejected": -3.0278680324554443, + "step": 183 + }, + { + "epoch": 0.28, + "learning_rate": 9.95248246010126e-07, + "logits/chosen": -1.1582351922988892, + "logits/rejected": -1.1300815343856812, + "logps/chosen": -68.10409545898438, + "logps/rejected": -137.97059631347656, + "loss": 0.2815, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.38671061396598816, + "rewards/margins": 4.390896797180176, + "rewards/rejected": -4.777607440948486, + "step": 184 + }, + { + "epoch": 0.28, + "learning_rate": 9.95064033115724e-07, + "logits/chosen": -1.1521022319793701, + "logits/rejected": -1.104382872581482, + "logps/chosen": -68.8747329711914, + "logps/rejected": -105.67164611816406, + "loss": 0.3112, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5325960516929626, + "rewards/margins": 2.1952645778656006, + "rewards/rejected": -2.727860689163208, + "step": 185 + }, + { + "epoch": 0.28, + "learning_rate": 9.948763347825228e-07, + "logits/chosen": -0.9579401016235352, + "logits/rejected": -0.9352121949195862, + "logps/chosen": -47.36273193359375, + "logps/rejected": -93.3355941772461, + "loss": 0.3195, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.1721753031015396, + "rewards/margins": 3.135554313659668, + "rewards/rejected": -3.307729721069336, + "step": 186 + }, + { + "epoch": 0.28, + "learning_rate": 9.946851523319902e-07, + "logits/chosen": -1.2986584901809692, + "logits/rejected": -1.3009908199310303, + "logps/chosen": -62.94734573364258, + "logps/rejected": -113.10549926757812, + "loss": 0.28, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2780383825302124, + "rewards/margins": 3.869349241256714, + "rewards/rejected": -4.147387504577637, + "step": 187 + }, + { + "epoch": 0.29, + "learning_rate": 9.944904871101226e-07, + "logits/chosen": -1.2574782371520996, + "logits/rejected": -1.2606135606765747, + "logps/chosen": -70.70069885253906, + "logps/rejected": -127.10391998291016, + "loss": 0.1957, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3717065453529358, + "rewards/margins": 3.6318910121917725, + "rewards/rejected": -4.003597259521484, + "step": 188 + }, + { + "epoch": 0.29, + "learning_rate": 9.942923404874375e-07, + "logits/chosen": -1.2451214790344238, + "logits/rejected": -1.2099590301513672, + "logps/chosen": -77.06783294677734, + "logps/rejected": -159.39584350585938, + "loss": 0.2788, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.39847332239151, + "rewards/margins": 5.332844257354736, + "rewards/rejected": -5.731317520141602, + "step": 189 + }, + { + "epoch": 0.29, + "learning_rate": 9.940907138589622e-07, + "logits/chosen": -1.1888874769210815, + "logits/rejected": -1.2535089254379272, + "logps/chosen": -50.92986297607422, + "logps/rejected": -92.7094955444336, + "loss": 0.2835, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.09499190747737885, + "rewards/margins": 3.2975873947143555, + "rewards/rejected": -3.3925790786743164, + "step": 190 + }, + { + "epoch": 0.29, + "learning_rate": 9.93885608644225e-07, + "logits/chosen": -1.2434340715408325, + "logits/rejected": -1.249043583869934, + "logps/chosen": -60.137359619140625, + "logps/rejected": -101.47135925292969, + "loss": 0.2708, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.4408572316169739, + "rewards/margins": 3.2096920013427734, + "rewards/rejected": -2.768834352493286, + "step": 191 + }, + { + "epoch": 0.29, + "learning_rate": 9.936770262872443e-07, + "logits/chosen": -1.2576887607574463, + "logits/rejected": -1.2570699453353882, + "logps/chosen": -64.34721374511719, + "logps/rejected": -108.09801483154297, + "loss": 0.3193, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3119131922721863, + "rewards/margins": 3.3604087829589844, + "rewards/rejected": -3.048495292663574, + "step": 192 + }, + { + "epoch": 0.29, + "learning_rate": 9.934649682565191e-07, + "logits/chosen": -1.1765260696411133, + "logits/rejected": -1.2171003818511963, + "logps/chosen": -54.831790924072266, + "logps/rejected": -84.8888931274414, + "loss": 0.2554, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1674397885799408, + "rewards/margins": 2.4152469635009766, + "rewards/rejected": -2.247807264328003, + "step": 193 + }, + { + "epoch": 0.29, + "learning_rate": 9.932494360450184e-07, + "logits/chosen": -1.1340199708938599, + "logits/rejected": -1.1504125595092773, + "logps/chosen": -70.6257095336914, + "logps/rejected": -103.02034759521484, + "loss": 0.242, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.10550439357757568, + "rewards/margins": 3.1961519718170166, + "rewards/rejected": -3.0906476974487305, + "step": 194 + }, + { + "epoch": 0.3, + "learning_rate": 9.930304311701708e-07, + "logits/chosen": -1.1937754154205322, + "logits/rejected": -1.2025905847549438, + "logps/chosen": -50.76273727416992, + "logps/rejected": -83.76286315917969, + "loss": 0.2905, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6456944346427917, + "rewards/margins": 2.7552313804626465, + "rewards/rejected": -2.109536647796631, + "step": 195 + }, + { + "epoch": 0.3, + "learning_rate": 9.928079551738541e-07, + "logits/chosen": -1.2389689683914185, + "logits/rejected": -1.2017418146133423, + "logps/chosen": -63.504573822021484, + "logps/rejected": -107.82794952392578, + "loss": 0.2614, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2145373821258545, + "rewards/margins": 2.6952617168426514, + "rewards/rejected": -2.480724334716797, + "step": 196 + }, + { + "epoch": 0.3, + "learning_rate": 9.925820096223836e-07, + "logits/chosen": -1.2189017534255981, + "logits/rejected": -1.194325566291809, + "logps/chosen": -69.2046127319336, + "logps/rejected": -106.79684448242188, + "loss": 0.3245, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.10513018071651459, + "rewards/margins": 2.890178918838501, + "rewards/rejected": -2.9953088760375977, + "step": 197 + }, + { + "epoch": 0.3, + "learning_rate": 9.923525961065017e-07, + "logits/chosen": -1.2128559350967407, + "logits/rejected": -1.1879463195800781, + "logps/chosen": -47.020294189453125, + "logps/rejected": -67.4661636352539, + "loss": 0.2451, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.01765884831547737, + "rewards/margins": 1.2542365789413452, + "rewards/rejected": -1.271895408630371, + "step": 198 + }, + { + "epoch": 0.3, + "learning_rate": 9.92119716241367e-07, + "logits/chosen": -1.1332924365997314, + "logits/rejected": -1.2118947505950928, + "logps/chosen": -74.53262329101562, + "logps/rejected": -114.64993286132812, + "loss": 0.2726, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5381104946136475, + "rewards/margins": 3.5477006435394287, + "rewards/rejected": -4.085811138153076, + "step": 199 + }, + { + "epoch": 0.3, + "learning_rate": 9.918833716665418e-07, + "logits/chosen": -1.3250250816345215, + "logits/rejected": -1.3644132614135742, + "logps/chosen": -57.79987335205078, + "logps/rejected": -91.33990478515625, + "loss": 0.2522, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.40653082728385925, + "rewards/margins": 2.3988020420074463, + "rewards/rejected": -2.805332899093628, + "step": 200 + }, + { + "epoch": 0.31, + "learning_rate": 9.916435640459816e-07, + "logits/chosen": -1.116808295249939, + "logits/rejected": -1.1277410984039307, + "logps/chosen": -58.26860427856445, + "logps/rejected": -125.79907989501953, + "loss": 0.1899, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07275482267141342, + "rewards/margins": 4.3385233879089355, + "rewards/rejected": -4.265768051147461, + "step": 201 + }, + { + "epoch": 0.31, + "learning_rate": 9.914002950680238e-07, + "logits/chosen": -1.174288272857666, + "logits/rejected": -1.2112677097320557, + "logps/chosen": -67.45306396484375, + "logps/rejected": -127.85252380371094, + "loss": 0.243, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.22681477665901184, + "rewards/margins": 4.279499053955078, + "rewards/rejected": -4.506314277648926, + "step": 202 + }, + { + "epoch": 0.31, + "learning_rate": 9.911535664453736e-07, + "logits/chosen": -1.1215676069259644, + "logits/rejected": -1.160021185874939, + "logps/chosen": -74.76897430419922, + "logps/rejected": -114.86557006835938, + "loss": 0.224, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1561341285705566, + "rewards/margins": 3.272386074066162, + "rewards/rejected": -4.428520202636719, + "step": 203 + }, + { + "epoch": 0.31, + "learning_rate": 9.909033799150946e-07, + "logits/chosen": -1.0842782258987427, + "logits/rejected": -1.1059097051620483, + "logps/chosen": -53.11720657348633, + "logps/rejected": -105.79139709472656, + "loss": 0.3057, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.6020134687423706, + "rewards/margins": 4.452319622039795, + "rewards/rejected": -3.850306272506714, + "step": 204 + }, + { + "epoch": 0.31, + "learning_rate": 9.906497372385948e-07, + "logits/chosen": -1.105309247970581, + "logits/rejected": -1.138174057006836, + "logps/chosen": -44.412696838378906, + "logps/rejected": -71.90165710449219, + "loss": 0.2139, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4434084892272949, + "rewards/margins": 2.3051414489746094, + "rewards/rejected": -1.8617329597473145, + "step": 205 + }, + { + "epoch": 0.31, + "learning_rate": 9.90392640201615e-07, + "logits/chosen": -1.3656604290008545, + "logits/rejected": -1.3901586532592773, + "logps/chosen": -75.88610076904297, + "logps/rejected": -128.65650939941406, + "loss": 0.2505, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6848592162132263, + "rewards/margins": 3.7529115676879883, + "rewards/rejected": -4.437770843505859, + "step": 206 + }, + { + "epoch": 0.31, + "learning_rate": 9.901320906142164e-07, + "logits/chosen": -1.0678926706314087, + "logits/rejected": -1.1116093397140503, + "logps/chosen": -58.641170501708984, + "logps/rejected": -92.04139709472656, + "loss": 0.3181, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1833564043045044, + "rewards/margins": 3.2483644485473633, + "rewards/rejected": -3.0650081634521484, + "step": 207 + }, + { + "epoch": 0.32, + "learning_rate": 9.898680903107666e-07, + "logits/chosen": -1.4462071657180786, + "logits/rejected": -1.4025518894195557, + "logps/chosen": -52.31578826904297, + "logps/rejected": -99.5600357055664, + "loss": 0.2927, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.09301090240478516, + "rewards/margins": 3.4032158851623535, + "rewards/rejected": -3.4962263107299805, + "step": 208 + }, + { + "epoch": 0.32, + "learning_rate": 9.89600641149928e-07, + "logits/chosen": -1.1321766376495361, + "logits/rejected": -1.1258957386016846, + "logps/chosen": -60.00716018676758, + "logps/rejected": -103.17889404296875, + "loss": 0.2202, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.041563332080841064, + "rewards/margins": 3.4386918544769287, + "rewards/rejected": -3.3971283435821533, + "step": 209 + }, + { + "epoch": 0.32, + "learning_rate": 9.893297450146444e-07, + "logits/chosen": -1.1479976177215576, + "logits/rejected": -1.25742506980896, + "logps/chosen": -47.2739372253418, + "logps/rejected": -67.57801818847656, + "loss": 0.2437, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.7823903560638428, + "rewards/margins": 2.870223045349121, + "rewards/rejected": -2.0878329277038574, + "step": 210 + }, + { + "epoch": 0.32, + "learning_rate": 9.890554038121272e-07, + "logits/chosen": -1.3039093017578125, + "logits/rejected": -1.329890489578247, + "logps/chosen": -83.64131164550781, + "logps/rejected": -140.93768310546875, + "loss": 0.1856, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3537253439426422, + "rewards/margins": 4.876173973083496, + "rewards/rejected": -5.229898929595947, + "step": 211 + }, + { + "epoch": 0.32, + "learning_rate": 9.887776194738431e-07, + "logits/chosen": -1.1970943212509155, + "logits/rejected": -1.1890867948532104, + "logps/chosen": -50.33533477783203, + "logps/rejected": -102.55012512207031, + "loss": 0.2216, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.2218342274427414, + "rewards/margins": 4.181835174560547, + "rewards/rejected": -3.960000991821289, + "step": 212 + }, + { + "epoch": 0.32, + "learning_rate": 9.88496393955499e-07, + "logits/chosen": -1.1353914737701416, + "logits/rejected": -1.090192198753357, + "logps/chosen": -60.373779296875, + "logps/rejected": -134.13621520996094, + "loss": 0.2127, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.05712069571018219, + "rewards/margins": 5.0528154373168945, + "rewards/rejected": -4.995695114135742, + "step": 213 + }, + { + "epoch": 0.33, + "learning_rate": 9.882117292370295e-07, + "logits/chosen": -1.2194281816482544, + "logits/rejected": -1.2036105394363403, + "logps/chosen": -57.46979904174805, + "logps/rejected": -101.6165542602539, + "loss": 0.2892, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.1329529583454132, + "rewards/margins": 3.2106685638427734, + "rewards/rejected": -3.0777151584625244, + "step": 214 + }, + { + "epoch": 0.33, + "learning_rate": 9.87923627322582e-07, + "logits/chosen": -1.1778866052627563, + "logits/rejected": -1.2296476364135742, + "logps/chosen": -68.89556121826172, + "logps/rejected": -125.4426498413086, + "loss": 0.2999, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.49690526723861694, + "rewards/margins": 4.3306884765625, + "rewards/rejected": -4.827593803405762, + "step": 215 + }, + { + "epoch": 0.33, + "learning_rate": 9.87632090240504e-07, + "logits/chosen": -1.0935524702072144, + "logits/rejected": -1.1171512603759766, + "logps/chosen": -55.67235565185547, + "logps/rejected": -103.5525131225586, + "loss": 0.2469, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.23011675477027893, + "rewards/margins": 4.043491840362549, + "rewards/rejected": -3.8133747577667236, + "step": 216 + }, + { + "epoch": 0.33, + "learning_rate": 9.873371200433268e-07, + "logits/chosen": -1.3669008016586304, + "logits/rejected": -1.4004020690917969, + "logps/chosen": -75.74365234375, + "logps/rejected": -142.23971557617188, + "loss": 0.2561, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0013476610183716, + "rewards/margins": 4.953007221221924, + "rewards/rejected": -5.954354763031006, + "step": 217 + }, + { + "epoch": 0.33, + "learning_rate": 9.87038718807753e-07, + "logits/chosen": -1.1259562969207764, + "logits/rejected": -1.133103370666504, + "logps/chosen": -50.491188049316406, + "logps/rejected": -103.2459716796875, + "loss": 0.2994, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14921081066131592, + "rewards/margins": 3.5174503326416016, + "rewards/rejected": -3.666661024093628, + "step": 218 + }, + { + "epoch": 0.33, + "learning_rate": 9.867368886346399e-07, + "logits/chosen": -1.2843149900436401, + "logits/rejected": -1.2164344787597656, + "logps/chosen": -56.01287078857422, + "logps/rejected": -117.17066955566406, + "loss": 0.2595, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.04519526660442352, + "rewards/margins": 3.4883525371551514, + "rewards/rejected": -3.443157196044922, + "step": 219 + }, + { + "epoch": 0.33, + "learning_rate": 9.864316316489872e-07, + "logits/chosen": -1.1004031896591187, + "logits/rejected": -1.053981065750122, + "logps/chosen": -57.81025314331055, + "logps/rejected": -111.32896423339844, + "loss": 0.3343, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.6295335292816162, + "rewards/margins": 3.3364522457122803, + "rewards/rejected": -3.9659860134124756, + "step": 220 + }, + { + "epoch": 0.34, + "learning_rate": 9.8612294999992e-07, + "logits/chosen": -1.2155131101608276, + "logits/rejected": -1.3413316011428833, + "logps/chosen": -44.69001770019531, + "logps/rejected": -102.89868927001953, + "loss": 0.2707, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5029670000076294, + "rewards/margins": 4.66603422164917, + "rewards/rejected": -4.16306734085083, + "step": 221 + }, + { + "epoch": 0.34, + "learning_rate": 9.858108458606738e-07, + "logits/chosen": -1.024707317352295, + "logits/rejected": -1.0007303953170776, + "logps/chosen": -50.13225173950195, + "logps/rejected": -92.38772583007812, + "loss": 0.2625, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6516220569610596, + "rewards/margins": 3.045931816101074, + "rewards/rejected": -2.3943097591400146, + "step": 222 + }, + { + "epoch": 0.34, + "learning_rate": 9.854953214285807e-07, + "logits/chosen": -1.240480899810791, + "logits/rejected": -1.207828402519226, + "logps/chosen": -60.037818908691406, + "logps/rejected": -124.46925354003906, + "loss": 0.3404, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.1628638654947281, + "rewards/margins": 4.1099772453308105, + "rewards/rejected": -4.272841453552246, + "step": 223 + }, + { + "epoch": 0.34, + "learning_rate": 9.851763789250525e-07, + "logits/chosen": -1.2094461917877197, + "logits/rejected": -1.18070650100708, + "logps/chosen": -48.78962326049805, + "logps/rejected": -94.73063659667969, + "loss": 0.1944, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.5968643426895142, + "rewards/margins": 3.2663774490356445, + "rewards/rejected": -2.669512987136841, + "step": 224 + }, + { + "epoch": 0.34, + "learning_rate": 9.848540205955653e-07, + "logits/chosen": -1.097152590751648, + "logits/rejected": -1.0911446809768677, + "logps/chosen": -64.8131103515625, + "logps/rejected": -99.00874328613281, + "loss": 0.2257, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.23178008198738098, + "rewards/margins": 3.4839677810668945, + "rewards/rejected": -3.7157483100891113, + "step": 225 + }, + { + "epoch": 0.34, + "learning_rate": 9.845282487096447e-07, + "logits/chosen": -1.266671061515808, + "logits/rejected": -1.2435628175735474, + "logps/chosen": -64.22138977050781, + "logps/rejected": -128.06129455566406, + "loss": 0.2491, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2566155195236206, + "rewards/margins": 4.87740421295166, + "rewards/rejected": -4.620787620544434, + "step": 226 + }, + { + "epoch": 0.34, + "learning_rate": 9.841990655608478e-07, + "logits/chosen": -1.2105844020843506, + "logits/rejected": -1.2728626728057861, + "logps/chosen": -55.29273986816406, + "logps/rejected": -86.1402359008789, + "loss": 0.2898, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.6476824283599854, + "rewards/margins": 3.0936102867126465, + "rewards/rejected": -2.445927858352661, + "step": 227 + }, + { + "epoch": 0.35, + "learning_rate": 9.838664734667495e-07, + "logits/chosen": -1.390692114830017, + "logits/rejected": -1.3265589475631714, + "logps/chosen": -64.66275024414062, + "logps/rejected": -111.31877136230469, + "loss": 0.2887, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8149236440658569, + "rewards/margins": 2.563906192779541, + "rewards/rejected": -3.3788299560546875, + "step": 228 + }, + { + "epoch": 0.35, + "learning_rate": 9.83530474768924e-07, + "logits/chosen": -1.4756298065185547, + "logits/rejected": -1.4475144147872925, + "logps/chosen": -56.14897155761719, + "logps/rejected": -102.84004974365234, + "loss": 0.2724, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.16208699345588684, + "rewards/margins": 3.177507162094116, + "rewards/rejected": -3.3395941257476807, + "step": 229 + }, + { + "epoch": 0.35, + "learning_rate": 9.831910718329301e-07, + "logits/chosen": -1.2161978483200073, + "logits/rejected": -1.305418848991394, + "logps/chosen": -67.6595687866211, + "logps/rejected": -103.6363754272461, + "loss": 0.3052, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.19699662923812866, + "rewards/margins": 3.100790500640869, + "rewards/rejected": -3.2977874279022217, + "step": 230 + }, + { + "epoch": 0.35, + "learning_rate": 9.828482670482934e-07, + "logits/chosen": -1.1499532461166382, + "logits/rejected": -1.1783616542816162, + "logps/chosen": -52.1451416015625, + "logps/rejected": -112.85889434814453, + "loss": 0.2487, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.501251757144928, + "rewards/margins": 4.617308139801025, + "rewards/rejected": -4.116055965423584, + "step": 231 + }, + { + "epoch": 0.35, + "learning_rate": 9.825020628284895e-07, + "logits/chosen": -1.0289162397384644, + "logits/rejected": -1.109811544418335, + "logps/chosen": -46.92026138305664, + "logps/rejected": -62.872901916503906, + "loss": 0.2997, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.0598130077123642, + "rewards/margins": 1.7254122495651245, + "rewards/rejected": -1.78522527217865, + "step": 232 + }, + { + "epoch": 0.35, + "learning_rate": 9.821524616109275e-07, + "logits/chosen": -1.3550623655319214, + "logits/rejected": -1.3795663118362427, + "logps/chosen": -51.85411071777344, + "logps/rejected": -72.90492248535156, + "loss": 0.2706, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.14976592361927032, + "rewards/margins": 2.097384214401245, + "rewards/rejected": -2.24714994430542, + "step": 233 + }, + { + "epoch": 0.36, + "learning_rate": 9.817994658569332e-07, + "logits/chosen": -1.3676050901412964, + "logits/rejected": -1.3363038301467896, + "logps/chosen": -64.43699645996094, + "logps/rejected": -100.00003814697266, + "loss": 0.2658, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.08322476595640182, + "rewards/margins": 2.7351748943328857, + "rewards/rejected": -2.818399429321289, + "step": 234 + }, + { + "epoch": 0.36, + "learning_rate": 9.814430780517304e-07, + "logits/chosen": -1.149186372756958, + "logits/rejected": -1.2141780853271484, + "logps/chosen": -55.30868911743164, + "logps/rejected": -103.93698120117188, + "loss": 0.1667, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1783062219619751, + "rewards/margins": 3.7370057106018066, + "rewards/rejected": -3.558699131011963, + "step": 235 + }, + { + "epoch": 0.36, + "learning_rate": 9.810833007044246e-07, + "logits/chosen": -1.3775757551193237, + "logits/rejected": -1.3689650297164917, + "logps/chosen": -47.52128601074219, + "logps/rejected": -105.66326904296875, + "loss": 0.2027, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.23522144556045532, + "rewards/margins": 3.8973565101623535, + "rewards/rejected": -3.662134885787964, + "step": 236 + }, + { + "epoch": 0.36, + "learning_rate": 9.80720136347985e-07, + "logits/chosen": -1.1896703243255615, + "logits/rejected": -1.1428377628326416, + "logps/chosen": -67.61991119384766, + "logps/rejected": -140.35009765625, + "loss": 0.2339, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.19219180941581726, + "rewards/margins": 5.137165546417236, + "rewards/rejected": -5.329357147216797, + "step": 237 + }, + { + "epoch": 0.36, + "learning_rate": 9.80353587539227e-07, + "logits/chosen": -1.1677360534667969, + "logits/rejected": -1.2059762477874756, + "logps/chosen": -45.55229568481445, + "logps/rejected": -69.67662048339844, + "loss": 0.2635, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6372277736663818, + "rewards/margins": 2.0313403606414795, + "rewards/rejected": -1.394112467765808, + "step": 238 + }, + { + "epoch": 0.36, + "learning_rate": 9.799836568587927e-07, + "logits/chosen": -1.1546509265899658, + "logits/rejected": -1.1933931112289429, + "logps/chosen": -43.91341781616211, + "logps/rejected": -63.79443359375, + "loss": 0.2546, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.706580400466919, + "rewards/margins": 1.6172070503234863, + "rewards/rejected": -0.9106266498565674, + "step": 239 + }, + { + "epoch": 0.36, + "learning_rate": 9.796103469111349e-07, + "logits/chosen": -1.25909423828125, + "logits/rejected": -1.2822978496551514, + "logps/chosen": -71.47257232666016, + "logps/rejected": -104.23622131347656, + "loss": 0.2619, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5179034471511841, + "rewards/margins": 2.7368874549865723, + "rewards/rejected": -3.254790782928467, + "step": 240 + }, + { + "epoch": 0.37, + "learning_rate": 9.792336603244977e-07, + "logits/chosen": -1.4345574378967285, + "logits/rejected": -1.4878321886062622, + "logps/chosen": -83.87324523925781, + "logps/rejected": -128.76010131835938, + "loss": 0.2103, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.596457839012146, + "rewards/margins": 3.8281967639923096, + "rewards/rejected": -4.424654960632324, + "step": 241 + }, + { + "epoch": 0.37, + "learning_rate": 9.78853599750897e-07, + "logits/chosen": -1.3332566022872925, + "logits/rejected": -1.3457403182983398, + "logps/chosen": -73.25218963623047, + "logps/rejected": -116.97953033447266, + "loss": 0.2221, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.44591355323791504, + "rewards/margins": 4.094986915588379, + "rewards/rejected": -4.540900707244873, + "step": 242 + }, + { + "epoch": 0.37, + "learning_rate": 9.784701678661044e-07, + "logits/chosen": -1.2335759401321411, + "logits/rejected": -1.1885710954666138, + "logps/chosen": -60.02530288696289, + "logps/rejected": -103.60508728027344, + "loss": 0.2076, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3031735420227051, + "rewards/margins": 3.3130042552948, + "rewards/rejected": -3.616178035736084, + "step": 243 + }, + { + "epoch": 0.37, + "learning_rate": 9.780833673696254e-07, + "logits/chosen": -1.1043503284454346, + "logits/rejected": -1.105530023574829, + "logps/chosen": -71.57035064697266, + "logps/rejected": -129.08958435058594, + "loss": 0.2174, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9568031430244446, + "rewards/margins": 4.0563225746154785, + "rewards/rejected": -5.013125419616699, + "step": 244 + }, + { + "epoch": 0.37, + "learning_rate": 9.776932009846824e-07, + "logits/chosen": -1.1673836708068848, + "logits/rejected": -1.2674115896224976, + "logps/chosen": -68.6380615234375, + "logps/rejected": -120.55182647705078, + "loss": 0.3264, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9497154951095581, + "rewards/margins": 4.242981433868408, + "rewards/rejected": -5.192696571350098, + "step": 245 + }, + { + "epoch": 0.37, + "learning_rate": 9.772996714581956e-07, + "logits/chosen": -1.362168788909912, + "logits/rejected": -1.3981695175170898, + "logps/chosen": -58.71257781982422, + "logps/rejected": -120.30799865722656, + "loss": 0.2208, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.5725867748260498, + "rewards/margins": 5.25115442276001, + "rewards/rejected": -4.678566932678223, + "step": 246 + }, + { + "epoch": 0.38, + "learning_rate": 9.769027815607614e-07, + "logits/chosen": -1.2044932842254639, + "logits/rejected": -1.2403883934020996, + "logps/chosen": -82.71932983398438, + "logps/rejected": -135.8838348388672, + "loss": 0.223, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8911706209182739, + "rewards/margins": 4.8175129890441895, + "rewards/rejected": -5.708683967590332, + "step": 247 + }, + { + "epoch": 0.38, + "learning_rate": 9.76502534086636e-07, + "logits/chosen": -1.0574793815612793, + "logits/rejected": -1.0156506299972534, + "logps/chosen": -56.54512405395508, + "logps/rejected": -107.8660659790039, + "loss": 0.2031, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5758581161499023, + "rewards/margins": 3.545283317565918, + "rewards/rejected": -2.9694252014160156, + "step": 248 + }, + { + "epoch": 0.38, + "learning_rate": 9.760989318537132e-07, + "logits/chosen": -1.0622340440750122, + "logits/rejected": -1.083043098449707, + "logps/chosen": -57.89778137207031, + "logps/rejected": -117.26451873779297, + "loss": 0.2621, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.18930283188819885, + "rewards/margins": 4.901935577392578, + "rewards/rejected": -5.091238021850586, + "step": 249 + }, + { + "epoch": 0.38, + "learning_rate": 9.756919777035064e-07, + "logits/chosen": -1.177422285079956, + "logits/rejected": -1.177398920059204, + "logps/chosen": -57.54464340209961, + "logps/rejected": -103.27391052246094, + "loss": 0.2271, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6480403542518616, + "rewards/margins": 3.5536599159240723, + "rewards/rejected": -4.201700210571289, + "step": 250 + }, + { + "epoch": 0.38, + "learning_rate": 9.752816745011272e-07, + "logits/chosen": -1.1077170372009277, + "logits/rejected": -1.1722453832626343, + "logps/chosen": -40.07292938232422, + "logps/rejected": -72.1737060546875, + "loss": 0.1764, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.33403611183166504, + "rewards/margins": 3.103440999984741, + "rewards/rejected": -2.769404888153076, + "step": 251 + }, + { + "epoch": 0.38, + "learning_rate": 9.748680251352658e-07, + "logits/chosen": -1.0602819919586182, + "logits/rejected": -1.024991750717163, + "logps/chosen": -69.75623321533203, + "logps/rejected": -145.09066772460938, + "loss": 0.201, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.4222402274608612, + "rewards/margins": 5.389873027801514, + "rewards/rejected": -5.812112808227539, + "step": 252 + }, + { + "epoch": 0.38, + "learning_rate": 9.744510325181711e-07, + "logits/chosen": -1.0655499696731567, + "logits/rejected": -1.152011513710022, + "logps/chosen": -72.12405395507812, + "logps/rejected": -142.640625, + "loss": 0.2077, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5751222372055054, + "rewards/margins": 5.4383440017700195, + "rewards/rejected": -6.0134663581848145, + "step": 253 + }, + { + "epoch": 0.39, + "learning_rate": 9.740306995856293e-07, + "logits/chosen": -1.2082499265670776, + "logits/rejected": -1.2432973384857178, + "logps/chosen": -50.88384246826172, + "logps/rejected": -86.19475555419922, + "loss": 0.2242, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.03699183464050293, + "rewards/margins": 3.2802982330322266, + "rewards/rejected": -3.2433063983917236, + "step": 254 + }, + { + "epoch": 0.39, + "learning_rate": 9.73607029296944e-07, + "logits/chosen": -1.1245856285095215, + "logits/rejected": -1.0869303941726685, + "logps/chosen": -47.463680267333984, + "logps/rejected": -108.2414779663086, + "loss": 0.3035, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.35966920852661133, + "rewards/margins": 4.96986722946167, + "rewards/rejected": -4.610198020935059, + "step": 255 + }, + { + "epoch": 0.39, + "learning_rate": 9.731800246349147e-07, + "logits/chosen": -1.1222176551818848, + "logits/rejected": -1.070405125617981, + "logps/chosen": -62.034725189208984, + "logps/rejected": -150.425537109375, + "loss": 0.2123, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1317824423313141, + "rewards/margins": 5.905609130859375, + "rewards/rejected": -5.773827075958252, + "step": 256 + }, + { + "epoch": 0.39, + "learning_rate": 9.727496886058167e-07, + "logits/chosen": -1.0427227020263672, + "logits/rejected": -1.0754176378250122, + "logps/chosen": -69.98577117919922, + "logps/rejected": -121.99186706542969, + "loss": 0.2175, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5424370765686035, + "rewards/margins": 4.675653457641602, + "rewards/rejected": -5.218091011047363, + "step": 257 + }, + { + "epoch": 0.39, + "learning_rate": 9.723160242393786e-07, + "logits/chosen": -1.2104133367538452, + "logits/rejected": -1.2432409524917603, + "logps/chosen": -60.75482940673828, + "logps/rejected": -103.19210052490234, + "loss": 0.1689, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.21256500482559204, + "rewards/margins": 3.6828160285949707, + "rewards/rejected": -3.895380973815918, + "step": 258 + }, + { + "epoch": 0.39, + "learning_rate": 9.718790345887628e-07, + "logits/chosen": -1.1556942462921143, + "logits/rejected": -1.2230334281921387, + "logps/chosen": -57.88835144042969, + "logps/rejected": -95.50018310546875, + "loss": 0.2567, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.3873271942138672, + "rewards/margins": 3.189028739929199, + "rewards/rejected": -3.5763559341430664, + "step": 259 + }, + { + "epoch": 0.39, + "learning_rate": 9.71438722730542e-07, + "logits/chosen": -1.127741813659668, + "logits/rejected": -1.088086485862732, + "logps/chosen": -68.16934204101562, + "logps/rejected": -144.72586059570312, + "loss": 0.2004, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.0546388179063797, + "rewards/margins": 6.129822731018066, + "rewards/rejected": -6.1844611167907715, + "step": 260 + }, + { + "epoch": 0.4, + "learning_rate": 9.70995091764679e-07, + "logits/chosen": -1.1576268672943115, + "logits/rejected": -1.1194590330123901, + "logps/chosen": -62.47630310058594, + "logps/rejected": -138.0134735107422, + "loss": 0.1726, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.17312532663345337, + "rewards/margins": 5.465775489807129, + "rewards/rejected": -5.29265022277832, + "step": 261 + }, + { + "epoch": 0.4, + "learning_rate": 9.705481448145044e-07, + "logits/chosen": -1.1115840673446655, + "logits/rejected": -1.0342121124267578, + "logps/chosen": -70.50397491455078, + "logps/rejected": -139.21095275878906, + "loss": 0.214, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6507506966590881, + "rewards/margins": 5.1096391677856445, + "rewards/rejected": -5.760389804840088, + "step": 262 + }, + { + "epoch": 0.4, + "learning_rate": 9.700978850266943e-07, + "logits/chosen": -1.100415825843811, + "logits/rejected": -1.117796540260315, + "logps/chosen": -35.40311050415039, + "logps/rejected": -65.78343200683594, + "loss": 0.2342, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.6401301622390747, + "rewards/margins": 2.996103286743164, + "rewards/rejected": -2.355973243713379, + "step": 263 + }, + { + "epoch": 0.4, + "learning_rate": 9.696443155712487e-07, + "logits/chosen": -0.9698693752288818, + "logits/rejected": -0.9955704212188721, + "logps/chosen": -51.29049301147461, + "logps/rejected": -91.20393371582031, + "loss": 0.2086, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.1662522405385971, + "rewards/margins": 3.645151138305664, + "rewards/rejected": -3.478898763656616, + "step": 264 + }, + { + "epoch": 0.4, + "learning_rate": 9.691874396414685e-07, + "logits/chosen": -0.9918010234832764, + "logits/rejected": -0.9242657423019409, + "logps/chosen": -60.58504104614258, + "logps/rejected": -118.39722442626953, + "loss": 0.2594, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3300020694732666, + "rewards/margins": 4.74561071395874, + "rewards/rejected": -5.075613021850586, + "step": 265 + }, + { + "epoch": 0.4, + "learning_rate": 9.687272604539342e-07, + "logits/chosen": -1.218865990638733, + "logits/rejected": -1.1760666370391846, + "logps/chosen": -59.44304656982422, + "logps/rejected": -107.35780334472656, + "loss": 0.2932, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.07953006029129028, + "rewards/margins": 3.563913345336914, + "rewards/rejected": -3.6434435844421387, + "step": 266 + }, + { + "epoch": 0.41, + "learning_rate": 9.68263781248482e-07, + "logits/chosen": -1.1206682920455933, + "logits/rejected": -1.141585350036621, + "logps/chosen": -69.39141845703125, + "logps/rejected": -125.5979232788086, + "loss": 0.2859, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.17774266004562378, + "rewards/margins": 3.902479410171509, + "rewards/rejected": -4.080222129821777, + "step": 267 + }, + { + "epoch": 0.41, + "learning_rate": 9.67797005288181e-07, + "logits/chosen": -1.0456727743148804, + "logits/rejected": -1.0675475597381592, + "logps/chosen": -90.54875183105469, + "logps/rejected": -148.25794982910156, + "loss": 0.2292, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8046572208404541, + "rewards/margins": 5.290049076080322, + "rewards/rejected": -6.094706058502197, + "step": 268 + }, + { + "epoch": 0.41, + "learning_rate": 9.67326935859312e-07, + "logits/chosen": -1.1662824153900146, + "logits/rejected": -1.1143728494644165, + "logps/chosen": -57.08251953125, + "logps/rejected": -104.96754455566406, + "loss": 0.2181, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.577634334564209, + "rewards/margins": 3.61334228515625, + "rewards/rejected": -4.190977096557617, + "step": 269 + }, + { + "epoch": 0.41, + "learning_rate": 9.668535762713415e-07, + "logits/chosen": -1.2060467004776, + "logits/rejected": -1.277174472808838, + "logps/chosen": -64.08255004882812, + "logps/rejected": -100.80244445800781, + "loss": 0.1997, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.6261399388313293, + "rewards/margins": 3.175387144088745, + "rewards/rejected": -3.8015270233154297, + "step": 270 + }, + { + "epoch": 0.41, + "learning_rate": 9.663769298569013e-07, + "logits/chosen": -1.2676234245300293, + "logits/rejected": -1.3310580253601074, + "logps/chosen": -70.11847686767578, + "logps/rejected": -106.50775909423828, + "loss": 0.2802, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3052479028701782, + "rewards/margins": 3.024759531021118, + "rewards/rejected": -4.330007553100586, + "step": 271 + }, + { + "epoch": 0.41, + "learning_rate": 9.65896999971763e-07, + "logits/chosen": -1.2232937812805176, + "logits/rejected": -1.1774309873580933, + "logps/chosen": -81.23110961914062, + "logps/rejected": -145.268798828125, + "loss": 0.2673, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7276835441589355, + "rewards/margins": 4.985037326812744, + "rewards/rejected": -5.71272087097168, + "step": 272 + }, + { + "epoch": 0.41, + "learning_rate": 9.654137899948155e-07, + "logits/chosen": -1.0879088640213013, + "logits/rejected": -1.1381860971450806, + "logps/chosen": -50.63553237915039, + "logps/rejected": -112.82023620605469, + "loss": 0.3102, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.28322744369506836, + "rewards/margins": 4.86517858505249, + "rewards/rejected": -4.581951141357422, + "step": 273 + }, + { + "epoch": 0.42, + "learning_rate": 9.649273033280399e-07, + "logits/chosen": -0.9488300085067749, + "logits/rejected": -0.9138250350952148, + "logps/chosen": -42.005889892578125, + "logps/rejected": -77.50165557861328, + "loss": 0.2053, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3961612582206726, + "rewards/margins": 2.9810876846313477, + "rewards/rejected": -2.584926128387451, + "step": 274 + }, + { + "epoch": 0.42, + "learning_rate": 9.644375433964878e-07, + "logits/chosen": -1.3720015287399292, + "logits/rejected": -1.371315360069275, + "logps/chosen": -63.39384078979492, + "logps/rejected": -117.76875305175781, + "loss": 0.2601, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.0266212597489357, + "rewards/margins": 3.921591281890869, + "rewards/rejected": -3.894970417022705, + "step": 275 + }, + { + "epoch": 0.42, + "learning_rate": 9.639445136482546e-07, + "logits/chosen": -1.2813271284103394, + "logits/rejected": -1.2421196699142456, + "logps/chosen": -49.05527877807617, + "logps/rejected": -90.07084655761719, + "loss": 0.2312, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.08839581906795502, + "rewards/margins": 3.386387825012207, + "rewards/rejected": -3.2979917526245117, + "step": 276 + }, + { + "epoch": 0.42, + "learning_rate": 9.634482175544572e-07, + "logits/chosen": -1.1708801984786987, + "logits/rejected": -1.0792722702026367, + "logps/chosen": -59.987640380859375, + "logps/rejected": -119.44989776611328, + "loss": 0.1914, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.23230049014091492, + "rewards/margins": 3.9318790435791016, + "rewards/rejected": -3.6995785236358643, + "step": 277 + }, + { + "epoch": 0.42, + "learning_rate": 9.629486586092086e-07, + "logits/chosen": -1.2014925479888916, + "logits/rejected": -1.2414860725402832, + "logps/chosen": -69.78931427001953, + "logps/rejected": -132.54397583007812, + "loss": 0.2746, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3791479468345642, + "rewards/margins": 4.3308329582214355, + "rewards/rejected": -4.7099809646606445, + "step": 278 + }, + { + "epoch": 0.42, + "learning_rate": 9.624458403295934e-07, + "logits/chosen": -1.213075876235962, + "logits/rejected": -1.19390070438385, + "logps/chosen": -43.44424057006836, + "logps/rejected": -76.5701675415039, + "loss": 0.2144, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.0699257180094719, + "rewards/margins": 1.9740471839904785, + "rewards/rejected": -2.0439724922180176, + "step": 279 + }, + { + "epoch": 0.43, + "learning_rate": 9.619397662556433e-07, + "logits/chosen": -0.9972809553146362, + "logits/rejected": -1.01755952835083, + "logps/chosen": -40.813270568847656, + "logps/rejected": -62.96757507324219, + "loss": 0.2012, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.38587141036987305, + "rewards/margins": 2.0120716094970703, + "rewards/rejected": -1.6262001991271973, + "step": 280 + }, + { + "epoch": 0.43, + "learning_rate": 9.614304399503119e-07, + "logits/chosen": -1.2082014083862305, + "logits/rejected": -1.1905056238174438, + "logps/chosen": -66.61445617675781, + "logps/rejected": -118.85020446777344, + "loss": 0.1423, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.19420230388641357, + "rewards/margins": 4.226605415344238, + "rewards/rejected": -4.032402515411377, + "step": 281 + }, + { + "epoch": 0.43, + "learning_rate": 9.609178649994497e-07, + "logits/chosen": -1.3044588565826416, + "logits/rejected": -1.3541276454925537, + "logps/chosen": -52.766082763671875, + "logps/rejected": -90.2623519897461, + "loss": 0.1909, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.12392090260982513, + "rewards/margins": 3.198810577392578, + "rewards/rejected": -3.3227314949035645, + "step": 282 + }, + { + "epoch": 0.43, + "learning_rate": 9.604020450117795e-07, + "logits/chosen": -1.1252713203430176, + "logits/rejected": -1.1225484609603882, + "logps/chosen": -54.23025131225586, + "logps/rejected": -95.14872741699219, + "loss": 0.21, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.1901187002658844, + "rewards/margins": 2.9272286891937256, + "rewards/rejected": -2.737109899520874, + "step": 283 + }, + { + "epoch": 0.43, + "learning_rate": 9.598829836188693e-07, + "logits/chosen": -1.232951283454895, + "logits/rejected": -1.172524333000183, + "logps/chosen": -41.288700103759766, + "logps/rejected": -89.16956329345703, + "loss": 0.2321, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7929279804229736, + "rewards/margins": 3.3749027252197266, + "rewards/rejected": -2.581974983215332, + "step": 284 + }, + { + "epoch": 0.43, + "learning_rate": 9.593606844751088e-07, + "logits/chosen": -1.143338918685913, + "logits/rejected": -1.110314965248108, + "logps/chosen": -53.824607849121094, + "logps/rejected": -98.65978240966797, + "loss": 0.1841, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.462774395942688, + "rewards/margins": 3.597409248352051, + "rewards/rejected": -3.1346347332000732, + "step": 285 + }, + { + "epoch": 0.43, + "learning_rate": 9.588351512576822e-07, + "logits/chosen": -1.1007797718048096, + "logits/rejected": -1.1043678522109985, + "logps/chosen": -46.744407653808594, + "logps/rejected": -114.91055297851562, + "loss": 0.1739, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.7569597959518433, + "rewards/margins": 5.05133581161499, + "rewards/rejected": -4.294375896453857, + "step": 286 + }, + { + "epoch": 0.44, + "learning_rate": 9.583063876665427e-07, + "logits/chosen": -0.8524032831192017, + "logits/rejected": -0.8362274169921875, + "logps/chosen": -47.458534240722656, + "logps/rejected": -105.9702377319336, + "loss": 0.2093, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.144188404083252, + "rewards/margins": 5.193797588348389, + "rewards/rejected": -4.049609184265137, + "step": 287 + }, + { + "epoch": 0.44, + "learning_rate": 9.577743974243872e-07, + "logits/chosen": -1.138148546218872, + "logits/rejected": -1.0631217956542969, + "logps/chosen": -50.755393981933594, + "logps/rejected": -98.93721771240234, + "loss": 0.1719, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.38429662585258484, + "rewards/margins": 3.2172207832336426, + "rewards/rejected": -3.6015172004699707, + "step": 288 + }, + { + "epoch": 0.44, + "learning_rate": 9.572391842766289e-07, + "logits/chosen": -1.3621189594268799, + "logits/rejected": -1.4565434455871582, + "logps/chosen": -43.73139572143555, + "logps/rejected": -75.05216217041016, + "loss": 0.1819, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.4174540638923645, + "rewards/margins": 3.1869609355926514, + "rewards/rejected": -2.7695069313049316, + "step": 289 + }, + { + "epoch": 0.44, + "learning_rate": 9.567007519913716e-07, + "logits/chosen": -1.2581207752227783, + "logits/rejected": -1.2855815887451172, + "logps/chosen": -57.83710861206055, + "logps/rejected": -90.186279296875, + "loss": 0.1771, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.07886456698179245, + "rewards/margins": 2.7510035037994385, + "rewards/rejected": -2.8298683166503906, + "step": 290 + }, + { + "epoch": 0.44, + "learning_rate": 9.561591043593827e-07, + "logits/chosen": -1.5122599601745605, + "logits/rejected": -1.458139181137085, + "logps/chosen": -64.88407135009766, + "logps/rejected": -127.58905029296875, + "loss": 0.2801, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.14722773432731628, + "rewards/margins": 4.665119647979736, + "rewards/rejected": -4.812347888946533, + "step": 291 + }, + { + "epoch": 0.44, + "learning_rate": 9.556142451940679e-07, + "logits/chosen": -1.184052586555481, + "logits/rejected": -1.2342268228530884, + "logps/chosen": -89.00273895263672, + "logps/rejected": -147.20021057128906, + "loss": 0.1953, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8906569480895996, + "rewards/margins": 5.703664779663086, + "rewards/rejected": -6.594321250915527, + "step": 292 + }, + { + "epoch": 0.45, + "learning_rate": 9.55066178331442e-07, + "logits/chosen": -1.000017523765564, + "logits/rejected": -0.9489910006523132, + "logps/chosen": -66.79763793945312, + "logps/rejected": -114.90792083740234, + "loss": 0.2076, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.5531197786331177, + "rewards/margins": 3.8519601821899414, + "rewards/rejected": -4.4050798416137695, + "step": 293 + }, + { + "epoch": 0.45, + "learning_rate": 9.545149076301043e-07, + "logits/chosen": -1.0007249116897583, + "logits/rejected": -1.0427497625350952, + "logps/chosen": -57.55965805053711, + "logps/rejected": -101.47200012207031, + "loss": 0.2055, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.12303817272186279, + "rewards/margins": 4.29061222076416, + "rewards/rejected": -4.413650035858154, + "step": 294 + }, + { + "epoch": 0.45, + "learning_rate": 9.539604369712098e-07, + "logits/chosen": -1.0703601837158203, + "logits/rejected": -1.0638940334320068, + "logps/chosen": -82.14090728759766, + "logps/rejected": -144.56829833984375, + "loss": 0.2249, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1805057525634766, + "rewards/margins": 5.812443733215332, + "rewards/rejected": -6.992949485778809, + "step": 295 + }, + { + "epoch": 0.45, + "learning_rate": 9.534027702584424e-07, + "logits/chosen": -1.241573691368103, + "logits/rejected": -1.092793345451355, + "logps/chosen": -51.69071960449219, + "logps/rejected": -119.88025665283203, + "loss": 0.229, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.14432412385940552, + "rewards/margins": 4.62017297744751, + "rewards/rejected": -4.76449728012085, + "step": 296 + }, + { + "epoch": 0.45, + "learning_rate": 9.528419114179876e-07, + "logits/chosen": -1.3773949146270752, + "logits/rejected": -1.326578140258789, + "logps/chosen": -66.81124114990234, + "logps/rejected": -117.5950927734375, + "loss": 0.2108, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.232625961303711, + "rewards/margins": 3.5316824913024902, + "rewards/rejected": -4.764308452606201, + "step": 297 + }, + { + "epoch": 0.45, + "learning_rate": 9.522778643985044e-07, + "logits/chosen": -1.1971187591552734, + "logits/rejected": -1.2293506860733032, + "logps/chosen": -73.28286743164062, + "logps/rejected": -128.9477081298828, + "loss": 0.2332, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6712610721588135, + "rewards/margins": 5.412903308868408, + "rewards/rejected": -6.084164619445801, + "step": 298 + }, + { + "epoch": 0.45, + "learning_rate": 9.517106331710984e-07, + "logits/chosen": -1.168115496635437, + "logits/rejected": -0.9988553524017334, + "logps/chosen": -62.709564208984375, + "logps/rejected": -160.54025268554688, + "loss": 0.1577, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.40043070912361145, + "rewards/margins": 6.569068908691406, + "rewards/rejected": -6.168639183044434, + "step": 299 + }, + { + "epoch": 0.46, + "learning_rate": 9.511402217292925e-07, + "logits/chosen": -1.0160331726074219, + "logits/rejected": -0.9988321661949158, + "logps/chosen": -58.68785095214844, + "logps/rejected": -117.34202575683594, + "loss": 0.1469, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07157469540834427, + "rewards/margins": 5.047296047210693, + "rewards/rejected": -5.118870735168457, + "step": 300 + }, + { + "epoch": 0.46, + "learning_rate": 9.505666340890002e-07, + "logits/chosen": -1.2377873659133911, + "logits/rejected": -1.3541501760482788, + "logps/chosen": -52.6754264831543, + "logps/rejected": -105.81607818603516, + "loss": 0.2249, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6247602701187134, + "rewards/margins": 4.983515739440918, + "rewards/rejected": -4.358755588531494, + "step": 301 + }, + { + "epoch": 0.46, + "learning_rate": 9.499898742884962e-07, + "logits/chosen": -1.3011903762817383, + "logits/rejected": -1.27763831615448, + "logps/chosen": -57.895233154296875, + "logps/rejected": -112.72341918945312, + "loss": 0.228, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.05821986496448517, + "rewards/margins": 4.705685138702393, + "rewards/rejected": -4.763904571533203, + "step": 302 + }, + { + "epoch": 0.46, + "learning_rate": 9.494099463883884e-07, + "logits/chosen": -1.1723133325576782, + "logits/rejected": -1.2371587753295898, + "logps/chosen": -77.86163330078125, + "logps/rejected": -140.04586791992188, + "loss": 0.1893, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.4797315001487732, + "rewards/margins": 5.164061546325684, + "rewards/rejected": -5.643793106079102, + "step": 303 + }, + { + "epoch": 0.46, + "learning_rate": 9.488268544715895e-07, + "logits/chosen": -1.2617186307907104, + "logits/rejected": -1.282090187072754, + "logps/chosen": -67.55493927001953, + "logps/rejected": -151.58358764648438, + "loss": 0.219, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05920267105102539, + "rewards/margins": 6.089982509613037, + "rewards/rejected": -6.149186134338379, + "step": 304 + }, + { + "epoch": 0.46, + "learning_rate": 9.48240602643288e-07, + "logits/chosen": -1.101471185684204, + "logits/rejected": -1.0918614864349365, + "logps/chosen": -65.73365783691406, + "logps/rejected": -136.63345336914062, + "loss": 0.1837, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6864421367645264, + "rewards/margins": 5.417891502380371, + "rewards/rejected": -6.104333877563477, + "step": 305 + }, + { + "epoch": 0.46, + "learning_rate": 9.476511950309197e-07, + "logits/chosen": -1.332780122756958, + "logits/rejected": -1.2241826057434082, + "logps/chosen": -93.1464614868164, + "logps/rejected": -171.5055389404297, + "loss": 0.2151, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9534928798675537, + "rewards/margins": 5.41372537612915, + "rewards/rejected": -7.367218494415283, + "step": 306 + }, + { + "epoch": 0.47, + "learning_rate": 9.470586357841377e-07, + "logits/chosen": -1.3806670904159546, + "logits/rejected": -1.364810585975647, + "logps/chosen": -80.15160369873047, + "logps/rejected": -118.8318862915039, + "loss": 0.1924, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4739937782287598, + "rewards/margins": 3.5889570713043213, + "rewards/rejected": -5.06295108795166, + "step": 307 + }, + { + "epoch": 0.47, + "learning_rate": 9.464629290747842e-07, + "logits/chosen": -1.0424054861068726, + "logits/rejected": -1.0304687023162842, + "logps/chosen": -57.2526969909668, + "logps/rejected": -138.89817810058594, + "loss": 0.1773, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.1174016147851944, + "rewards/margins": 6.3627142906188965, + "rewards/rejected": -6.2453131675720215, + "step": 308 + }, + { + "epoch": 0.47, + "learning_rate": 9.458640790968606e-07, + "logits/chosen": -1.050306797027588, + "logits/rejected": -0.9519209265708923, + "logps/chosen": -81.77310180664062, + "logps/rejected": -163.30325317382812, + "loss": 0.2278, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8570660948753357, + "rewards/margins": 6.205819606781006, + "rewards/rejected": -7.0628862380981445, + "step": 309 + }, + { + "epoch": 0.47, + "learning_rate": 9.452620900664985e-07, + "logits/chosen": -1.0299391746520996, + "logits/rejected": -0.9257128238677979, + "logps/chosen": -52.693416595458984, + "logps/rejected": -125.26481628417969, + "loss": 0.1896, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04881918430328369, + "rewards/margins": 5.693671703338623, + "rewards/rejected": -5.742491722106934, + "step": 310 + }, + { + "epoch": 0.47, + "learning_rate": 9.446569662219288e-07, + "logits/chosen": -1.1465742588043213, + "logits/rejected": -1.0597347021102905, + "logps/chosen": -73.81082153320312, + "logps/rejected": -138.83419799804688, + "loss": 0.2467, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1154011487960815, + "rewards/margins": 4.6926374435424805, + "rewards/rejected": -5.80803918838501, + "step": 311 + }, + { + "epoch": 0.47, + "learning_rate": 9.440487118234534e-07, + "logits/chosen": -1.2268579006195068, + "logits/rejected": -1.2430446147918701, + "logps/chosen": -67.38180541992188, + "logps/rejected": -122.89826965332031, + "loss": 0.1632, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8113257884979248, + "rewards/margins": 4.62271785736084, + "rewards/rejected": -5.434043884277344, + "step": 312 + }, + { + "epoch": 0.48, + "learning_rate": 9.434373311534145e-07, + "logits/chosen": -1.173991084098816, + "logits/rejected": -1.0881710052490234, + "logps/chosen": -60.490787506103516, + "logps/rejected": -127.25591278076172, + "loss": 0.2422, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.091408371925354, + "rewards/margins": 4.285440921783447, + "rewards/rejected": -5.3768486976623535, + "step": 313 + }, + { + "epoch": 0.48, + "learning_rate": 9.428228285161638e-07, + "logits/chosen": -1.2301355600357056, + "logits/rejected": -1.164600133895874, + "logps/chosen": -70.10157012939453, + "logps/rejected": -118.09928131103516, + "loss": 0.2747, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4453577995300293, + "rewards/margins": 4.48175573348999, + "rewards/rejected": -4.9271135330200195, + "step": 314 + }, + { + "epoch": 0.48, + "learning_rate": 9.422052082380334e-07, + "logits/chosen": -1.1066889762878418, + "logits/rejected": -1.1147856712341309, + "logps/chosen": -62.315975189208984, + "logps/rejected": -130.76109313964844, + "loss": 0.1691, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3953535556793213, + "rewards/margins": 5.75650691986084, + "rewards/rejected": -6.151860237121582, + "step": 315 + }, + { + "epoch": 0.48, + "learning_rate": 9.415844746673046e-07, + "logits/chosen": -1.182931661605835, + "logits/rejected": -1.141818881034851, + "logps/chosen": -64.85267639160156, + "logps/rejected": -149.100830078125, + "loss": 0.2965, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.32952946424484253, + "rewards/margins": 6.445108413696289, + "rewards/rejected": -6.7746381759643555, + "step": 316 + }, + { + "epoch": 0.48, + "learning_rate": 9.409606321741774e-07, + "logits/chosen": -0.9379570484161377, + "logits/rejected": -0.9440093636512756, + "logps/chosen": -55.83263397216797, + "logps/rejected": -94.1610336303711, + "loss": 0.272, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3002450466156006, + "rewards/margins": 3.7583513259887695, + "rewards/rejected": -4.058596134185791, + "step": 317 + }, + { + "epoch": 0.48, + "learning_rate": 9.4033368515074e-07, + "logits/chosen": -0.9900854825973511, + "logits/rejected": -1.0676978826522827, + "logps/chosen": -59.72868347167969, + "logps/rejected": -94.28669738769531, + "loss": 0.1464, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.35381263494491577, + "rewards/margins": 3.47701358795166, + "rewards/rejected": -3.8308260440826416, + "step": 318 + }, + { + "epoch": 0.48, + "learning_rate": 9.397036380109376e-07, + "logits/chosen": -1.237237572669983, + "logits/rejected": -1.2699791193008423, + "logps/chosen": -62.2976188659668, + "logps/rejected": -114.91215515136719, + "loss": 0.2888, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7755670547485352, + "rewards/margins": 4.2638468742370605, + "rewards/rejected": -5.039413928985596, + "step": 319 + }, + { + "epoch": 0.49, + "learning_rate": 9.390704951905411e-07, + "logits/chosen": -1.0435210466384888, + "logits/rejected": -1.0626487731933594, + "logps/chosen": -54.04743957519531, + "logps/rejected": -99.78398132324219, + "loss": 0.2092, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.44715026021003723, + "rewards/margins": 4.157877445220947, + "rewards/rejected": -4.60502815246582, + "step": 320 + }, + { + "epoch": 0.49, + "learning_rate": 9.384342611471164e-07, + "logits/chosen": -1.2864059209823608, + "logits/rejected": -1.2670793533325195, + "logps/chosen": -58.702476501464844, + "logps/rejected": -120.93059539794922, + "loss": 0.1706, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.18780693411827087, + "rewards/margins": 4.7762651443481445, + "rewards/rejected": -4.964072227478027, + "step": 321 + }, + { + "epoch": 0.49, + "learning_rate": 9.377949403599927e-07, + "logits/chosen": -1.1650375127792358, + "logits/rejected": -1.25117826461792, + "logps/chosen": -62.39760971069336, + "logps/rejected": -106.74618530273438, + "loss": 0.2156, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.0015542954206466675, + "rewards/margins": 3.9890871047973633, + "rewards/rejected": -3.9906415939331055, + "step": 322 + }, + { + "epoch": 0.49, + "learning_rate": 9.371525373302316e-07, + "logits/chosen": -1.234785556793213, + "logits/rejected": -1.229178547859192, + "logps/chosen": -36.128318786621094, + "logps/rejected": -83.70571899414062, + "loss": 0.1872, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.3476123809814453, + "rewards/margins": 3.839707612991333, + "rewards/rejected": -3.4920952320098877, + "step": 323 + }, + { + "epoch": 0.49, + "learning_rate": 9.36507056580594e-07, + "logits/chosen": -1.111757516860962, + "logits/rejected": -1.0463680028915405, + "logps/chosen": -60.20932388305664, + "logps/rejected": -113.43062591552734, + "loss": 0.1969, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.5552247762680054, + "rewards/margins": 4.3409104347229, + "rewards/rejected": -4.896135330200195, + "step": 324 + }, + { + "epoch": 0.49, + "learning_rate": 9.358585026555097e-07, + "logits/chosen": -0.9682977199554443, + "logits/rejected": -1.0178247690200806, + "logps/chosen": -41.273834228515625, + "logps/rejected": -77.8062515258789, + "loss": 0.2324, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.5879894495010376, + "rewards/margins": 3.407402992248535, + "rewards/rejected": -2.819413661956787, + "step": 325 + }, + { + "epoch": 0.5, + "learning_rate": 9.352068801210444e-07, + "logits/chosen": -1.2803910970687866, + "logits/rejected": -1.3069097995758057, + "logps/chosen": -64.29356384277344, + "logps/rejected": -111.56470489501953, + "loss": 0.2316, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.3384580612182617, + "rewards/margins": 4.075242519378662, + "rewards/rejected": -4.413699626922607, + "step": 326 + }, + { + "epoch": 0.5, + "learning_rate": 9.345521935648684e-07, + "logits/chosen": -1.156507968902588, + "logits/rejected": -1.0567623376846313, + "logps/chosen": -62.299598693847656, + "logps/rejected": -154.32818603515625, + "loss": 0.2469, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.4583524465560913, + "rewards/margins": 6.333713054656982, + "rewards/rejected": -6.792065620422363, + "step": 327 + }, + { + "epoch": 0.5, + "learning_rate": 9.338944475962236e-07, + "logits/chosen": -1.2198809385299683, + "logits/rejected": -1.1411675214767456, + "logps/chosen": -68.92489624023438, + "logps/rejected": -138.6844940185547, + "loss": 0.1705, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7334094643592834, + "rewards/margins": 5.524423599243164, + "rewards/rejected": -6.257833480834961, + "step": 328 + }, + { + "epoch": 0.5, + "learning_rate": 9.332336468458913e-07, + "logits/chosen": -1.2047687768936157, + "logits/rejected": -1.1440612077713013, + "logps/chosen": -69.50259399414062, + "logps/rejected": -136.95941162109375, + "loss": 0.2702, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1296145915985107, + "rewards/margins": 4.767106533050537, + "rewards/rejected": -5.896721363067627, + "step": 329 + }, + { + "epoch": 0.5, + "learning_rate": 9.325697959661601e-07, + "logits/chosen": -1.1211073398590088, + "logits/rejected": -1.1430073976516724, + "logps/chosen": -68.41262817382812, + "logps/rejected": -147.62721252441406, + "loss": 0.1273, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4298883080482483, + "rewards/margins": 6.240133762359619, + "rewards/rejected": -6.670022487640381, + "step": 330 + }, + { + "epoch": 0.5, + "learning_rate": 9.319028996307918e-07, + "logits/chosen": -1.2578020095825195, + "logits/rejected": -1.253657579421997, + "logps/chosen": -68.27984619140625, + "logps/rejected": -114.69712829589844, + "loss": 0.1736, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.4018743932247162, + "rewards/margins": 4.116807460784912, + "rewards/rejected": -4.518681526184082, + "step": 331 + }, + { + "epoch": 0.5, + "learning_rate": 9.312329625349901e-07, + "logits/chosen": -1.149940848350525, + "logits/rejected": -1.1540448665618896, + "logps/chosen": -74.48561096191406, + "logps/rejected": -133.59381103515625, + "loss": 0.1977, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7744425535202026, + "rewards/margins": 4.682015419006348, + "rewards/rejected": -5.45645809173584, + "step": 332 + }, + { + "epoch": 0.51, + "learning_rate": 9.305599893953669e-07, + "logits/chosen": -1.065702199935913, + "logits/rejected": -0.961828887462616, + "logps/chosen": -75.67964172363281, + "logps/rejected": -139.41537475585938, + "loss": 0.2296, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.7512010335922241, + "rewards/margins": 4.466395378112793, + "rewards/rejected": -6.217596530914307, + "step": 333 + }, + { + "epoch": 0.51, + "learning_rate": 9.298839849499081e-07, + "logits/chosen": -1.1605356931686401, + "logits/rejected": -1.2287828922271729, + "logps/chosen": -92.33887481689453, + "logps/rejected": -138.165771484375, + "loss": 0.285, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.864888072013855, + "rewards/margins": 4.075357437133789, + "rewards/rejected": -5.940246105194092, + "step": 334 + }, + { + "epoch": 0.51, + "learning_rate": 9.29204953957942e-07, + "logits/chosen": -0.9757486581802368, + "logits/rejected": -0.9516485333442688, + "logps/chosen": -70.66392517089844, + "logps/rejected": -120.23057556152344, + "loss": 0.1818, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0067274570465088, + "rewards/margins": 4.125641822814941, + "rewards/rejected": -5.1323699951171875, + "step": 335 + }, + { + "epoch": 0.51, + "learning_rate": 9.285229012001046e-07, + "logits/chosen": -1.2529933452606201, + "logits/rejected": -1.1360479593276978, + "logps/chosen": -66.08387756347656, + "logps/rejected": -158.76080322265625, + "loss": 0.1779, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6589143872261047, + "rewards/margins": 6.218050479888916, + "rewards/rejected": -6.876964092254639, + "step": 336 + }, + { + "epoch": 0.51, + "learning_rate": 9.278378314783064e-07, + "logits/chosen": -1.018735647201538, + "logits/rejected": -0.9996989369392395, + "logps/chosen": -62.99300003051758, + "logps/rejected": -109.50791931152344, + "loss": 0.221, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.2736590504646301, + "rewards/margins": 4.201366424560547, + "rewards/rejected": -4.4750261306762695, + "step": 337 + }, + { + "epoch": 0.51, + "learning_rate": 9.271497496156983e-07, + "logits/chosen": -1.1342700719833374, + "logits/rejected": -1.0868401527404785, + "logps/chosen": -76.54375457763672, + "logps/rejected": -150.23513793945312, + "loss": 0.2337, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5463071465492249, + "rewards/margins": 6.337887763977051, + "rewards/rejected": -6.884194850921631, + "step": 338 + }, + { + "epoch": 0.52, + "learning_rate": 9.26458660456638e-07, + "logits/chosen": -1.1688416004180908, + "logits/rejected": -1.167934536933899, + "logps/chosen": -51.82984161376953, + "logps/rejected": -99.10894012451172, + "loss": 0.2227, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.23706474900245667, + "rewards/margins": 3.779388904571533, + "rewards/rejected": -3.5423243045806885, + "step": 339 + }, + { + "epoch": 0.52, + "learning_rate": 9.257645688666555e-07, + "logits/chosen": -1.1728771924972534, + "logits/rejected": -1.1348259449005127, + "logps/chosen": -74.75570678710938, + "logps/rejected": -150.23471069335938, + "loss": 0.1724, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3358078002929688, + "rewards/margins": 5.848766803741455, + "rewards/rejected": -7.184574604034424, + "step": 340 + }, + { + "epoch": 0.52, + "learning_rate": 9.250674797324196e-07, + "logits/chosen": -1.222875714302063, + "logits/rejected": -1.204736590385437, + "logps/chosen": -71.1811752319336, + "logps/rejected": -137.2476806640625, + "loss": 0.1718, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.6591771245002747, + "rewards/margins": 5.725841522216797, + "rewards/rejected": -6.385018825531006, + "step": 341 + }, + { + "epoch": 0.52, + "learning_rate": 9.243673979617019e-07, + "logits/chosen": -1.2629741430282593, + "logits/rejected": -1.2538414001464844, + "logps/chosen": -85.21867370605469, + "logps/rejected": -159.30917358398438, + "loss": 0.2231, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6282932758331299, + "rewards/margins": 6.88715124130249, + "rewards/rejected": -7.515444278717041, + "step": 342 + }, + { + "epoch": 0.52, + "learning_rate": 9.236643284833445e-07, + "logits/chosen": -1.2181004285812378, + "logits/rejected": -1.2108968496322632, + "logps/chosen": -68.36640930175781, + "logps/rejected": -135.7064208984375, + "loss": 0.1703, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9516810178756714, + "rewards/margins": 5.099062442779541, + "rewards/rejected": -6.050743103027344, + "step": 343 + }, + { + "epoch": 0.52, + "learning_rate": 9.22958276247223e-07, + "logits/chosen": -1.1332156658172607, + "logits/rejected": -1.049139380455017, + "logps/chosen": -74.32099914550781, + "logps/rejected": -163.8959503173828, + "loss": 0.1956, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6016435623168945, + "rewards/margins": 7.112662315368652, + "rewards/rejected": -7.714305877685547, + "step": 344 + }, + { + "epoch": 0.52, + "learning_rate": 9.222492462242137e-07, + "logits/chosen": -1.0189480781555176, + "logits/rejected": -0.9224266409873962, + "logps/chosen": -85.83133697509766, + "logps/rejected": -165.11000061035156, + "loss": 0.2467, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.4347052574157715, + "rewards/margins": 6.352794647216797, + "rewards/rejected": -7.787499904632568, + "step": 345 + }, + { + "epoch": 0.53, + "learning_rate": 9.215372434061572e-07, + "logits/chosen": -0.9589766263961792, + "logits/rejected": -0.9576123952865601, + "logps/chosen": -51.86228561401367, + "logps/rejected": -117.89081573486328, + "loss": 0.1868, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08749858289957047, + "rewards/margins": 5.508365154266357, + "rewards/rejected": -5.420866966247559, + "step": 346 + }, + { + "epoch": 0.53, + "learning_rate": 9.208222728058235e-07, + "logits/chosen": -1.132300615310669, + "logits/rejected": -1.0416042804718018, + "logps/chosen": -65.39353942871094, + "logps/rejected": -144.50401306152344, + "loss": 0.2118, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.678921103477478, + "rewards/margins": 5.8557209968566895, + "rewards/rejected": -6.534642219543457, + "step": 347 + }, + { + "epoch": 0.53, + "learning_rate": 9.201043394568771e-07, + "logits/chosen": -1.2169915437698364, + "logits/rejected": -1.121902346611023, + "logps/chosen": -68.50205993652344, + "logps/rejected": -144.74172973632812, + "loss": 0.2686, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5129822492599487, + "rewards/margins": 6.310784339904785, + "rewards/rejected": -6.823767185211182, + "step": 348 + }, + { + "epoch": 0.53, + "learning_rate": 9.193834484138417e-07, + "logits/chosen": -1.0599150657653809, + "logits/rejected": -1.107330083847046, + "logps/chosen": -62.24093246459961, + "logps/rejected": -143.25433349609375, + "loss": 0.251, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.33035796880722046, + "rewards/margins": 6.537755489349365, + "rewards/rejected": -6.8681135177612305, + "step": 349 + }, + { + "epoch": 0.53, + "learning_rate": 9.186596047520638e-07, + "logits/chosen": -1.3434659242630005, + "logits/rejected": -1.2892396450042725, + "logps/chosen": -60.81199264526367, + "logps/rejected": -114.70281982421875, + "loss": 0.2041, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.15349704027175903, + "rewards/margins": 4.696380615234375, + "rewards/rejected": -4.84987735748291, + "step": 350 + }, + { + "epoch": 0.53, + "learning_rate": 9.179328135676778e-07, + "logits/chosen": -1.1484850645065308, + "logits/rejected": -1.020574927330017, + "logps/chosen": -59.90319061279297, + "logps/rejected": -120.53746795654297, + "loss": 0.1556, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6321442723274231, + "rewards/margins": 4.3459978103637695, + "rewards/rejected": -4.978141784667969, + "step": 351 + }, + { + "epoch": 0.53, + "learning_rate": 9.172030799775698e-07, + "logits/chosen": -1.0592190027236938, + "logits/rejected": -1.0199298858642578, + "logps/chosen": -53.4327392578125, + "logps/rejected": -110.98649597167969, + "loss": 0.2109, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.19057221710681915, + "rewards/margins": 5.060029983520508, + "rewards/rejected": -4.869457721710205, + "step": 352 + }, + { + "epoch": 0.54, + "learning_rate": 9.16470409119341e-07, + "logits/chosen": -0.8089007139205933, + "logits/rejected": -0.7608518004417419, + "logps/chosen": -88.42681884765625, + "logps/rejected": -136.05258178710938, + "loss": 0.121, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1394758224487305, + "rewards/margins": 4.462666988372803, + "rewards/rejected": -6.602142333984375, + "step": 353 + }, + { + "epoch": 0.54, + "learning_rate": 9.157348061512726e-07, + "logits/chosen": -1.0326448678970337, + "logits/rejected": -0.95633465051651, + "logps/chosen": -50.31575012207031, + "logps/rejected": -92.37877655029297, + "loss": 0.228, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.07662263512611389, + "rewards/margins": 3.4256300926208496, + "rewards/rejected": -3.5022528171539307, + "step": 354 + }, + { + "epoch": 0.54, + "learning_rate": 9.149962762522889e-07, + "logits/chosen": -1.1547660827636719, + "logits/rejected": -1.166226863861084, + "logps/chosen": -68.40359497070312, + "logps/rejected": -98.83624267578125, + "loss": 0.2265, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9433976411819458, + "rewards/margins": 3.1959171295166016, + "rewards/rejected": -4.139314651489258, + "step": 355 + }, + { + "epoch": 0.54, + "learning_rate": 9.14254824621921e-07, + "logits/chosen": -1.0819573402404785, + "logits/rejected": -1.0202829837799072, + "logps/chosen": -44.65376281738281, + "logps/rejected": -94.65969848632812, + "loss": 0.2218, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.07252389192581177, + "rewards/margins": 3.7984731197357178, + "rewards/rejected": -3.8709967136383057, + "step": 356 + }, + { + "epoch": 0.54, + "learning_rate": 9.135104564802698e-07, + "logits/chosen": -1.1751701831817627, + "logits/rejected": -1.098080039024353, + "logps/chosen": -84.35014343261719, + "logps/rejected": -157.07864379882812, + "loss": 0.2225, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8365371227264404, + "rewards/margins": 5.801826477050781, + "rewards/rejected": -7.638364315032959, + "step": 357 + }, + { + "epoch": 0.54, + "learning_rate": 9.127631770679697e-07, + "logits/chosen": -0.8695126175880432, + "logits/rejected": -0.8332204818725586, + "logps/chosen": -78.99563598632812, + "logps/rejected": -179.3597412109375, + "loss": 0.1488, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2126437425613403, + "rewards/margins": 7.655306339263916, + "rewards/rejected": -8.867949485778809, + "step": 358 + }, + { + "epoch": 0.55, + "learning_rate": 9.120129916461516e-07, + "logits/chosen": -1.0260288715362549, + "logits/rejected": -0.9362490773200989, + "logps/chosen": -70.77383422851562, + "logps/rejected": -133.34637451171875, + "loss": 0.2292, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0889973640441895, + "rewards/margins": 5.2399444580078125, + "rewards/rejected": -6.328941822052002, + "step": 359 + }, + { + "epoch": 0.55, + "learning_rate": 9.112599054964057e-07, + "logits/chosen": -1.1575307846069336, + "logits/rejected": -1.1207451820373535, + "logps/chosen": -61.183433532714844, + "logps/rejected": -120.67449188232422, + "loss": 0.1292, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.23736169934272766, + "rewards/margins": 4.948629379272461, + "rewards/rejected": -5.185991287231445, + "step": 360 + }, + { + "epoch": 0.55, + "learning_rate": 9.105039239207446e-07, + "logits/chosen": -0.8983496427536011, + "logits/rejected": -0.8002632260322571, + "logps/chosen": -60.88401794433594, + "logps/rejected": -127.92584228515625, + "loss": 0.249, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.4497146010398865, + "rewards/margins": 5.143093109130859, + "rewards/rejected": -5.592807769775391, + "step": 361 + }, + { + "epoch": 0.55, + "learning_rate": 9.097450522415655e-07, + "logits/chosen": -1.0423619747161865, + "logits/rejected": -1.0030453205108643, + "logps/chosen": -69.3251953125, + "logps/rejected": -130.1721954345703, + "loss": 0.2145, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3751425743103027, + "rewards/margins": 4.741582870483398, + "rewards/rejected": -6.116725444793701, + "step": 362 + }, + { + "epoch": 0.55, + "learning_rate": 9.089832958016135e-07, + "logits/chosen": -0.9802379608154297, + "logits/rejected": -0.7710241079330444, + "logps/chosen": -54.990028381347656, + "logps/rejected": -149.1652374267578, + "loss": 0.1517, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.02962431311607361, + "rewards/margins": 7.011379718780518, + "rewards/rejected": -7.041004657745361, + "step": 363 + }, + { + "epoch": 0.55, + "learning_rate": 9.082186599639427e-07, + "logits/chosen": -1.2180869579315186, + "logits/rejected": -1.288204550743103, + "logps/chosen": -63.243385314941406, + "logps/rejected": -113.26071166992188, + "loss": 0.1491, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6312990784645081, + "rewards/margins": 4.588357448577881, + "rewards/rejected": -5.219655990600586, + "step": 364 + }, + { + "epoch": 0.55, + "learning_rate": 9.074511501118805e-07, + "logits/chosen": -1.1787549257278442, + "logits/rejected": -1.2597904205322266, + "logps/chosen": -68.79399871826172, + "logps/rejected": -132.29443359375, + "loss": 0.1933, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.25832653045654297, + "rewards/margins": 5.924032211303711, + "rewards/rejected": -6.182358741760254, + "step": 365 + }, + { + "epoch": 0.56, + "learning_rate": 9.066807716489871e-07, + "logits/chosen": -0.8737149238586426, + "logits/rejected": -0.8693994283676147, + "logps/chosen": -47.5757942199707, + "logps/rejected": -100.08030700683594, + "loss": 0.1697, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.3265417218208313, + "rewards/margins": 4.437649726867676, + "rewards/rejected": -4.11110782623291, + "step": 366 + }, + { + "epoch": 0.56, + "learning_rate": 9.059075299990197e-07, + "logits/chosen": -0.7666571140289307, + "logits/rejected": -0.7947217226028442, + "logps/chosen": -50.94587707519531, + "logps/rejected": -99.97550201416016, + "loss": 0.1475, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.01705726981163025, + "rewards/margins": 4.673468589782715, + "rewards/rejected": -4.656411647796631, + "step": 367 + }, + { + "epoch": 0.56, + "learning_rate": 9.051314306058933e-07, + "logits/chosen": -1.194947600364685, + "logits/rejected": -1.2012661695480347, + "logps/chosen": -57.371673583984375, + "logps/rejected": -103.62713623046875, + "loss": 0.2171, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.020657628774642944, + "rewards/margins": 4.762262344360352, + "rewards/rejected": -4.741604804992676, + "step": 368 + }, + { + "epoch": 0.56, + "learning_rate": 9.043524789336422e-07, + "logits/chosen": -1.0432727336883545, + "logits/rejected": -1.1021490097045898, + "logps/chosen": -55.0340690612793, + "logps/rejected": -96.28125, + "loss": 0.1971, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2624248266220093, + "rewards/margins": 4.110570907592773, + "rewards/rejected": -4.372995853424072, + "step": 369 + }, + { + "epoch": 0.56, + "learning_rate": 9.035706804663818e-07, + "logits/chosen": -1.3030118942260742, + "logits/rejected": -1.3033980131149292, + "logps/chosen": -53.38115310668945, + "logps/rejected": -101.26620483398438, + "loss": 0.2095, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.3574654459953308, + "rewards/margins": 3.6292953491210938, + "rewards/rejected": -3.271829605102539, + "step": 370 + }, + { + "epoch": 0.56, + "learning_rate": 9.027860407082706e-07, + "logits/chosen": -1.0482661724090576, + "logits/rejected": -1.0609327554702759, + "logps/chosen": -43.941986083984375, + "logps/rejected": -121.45335388183594, + "loss": 0.1873, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.4450610280036926, + "rewards/margins": 6.240961074829102, + "rewards/rejected": -5.795900344848633, + "step": 371 + }, + { + "epoch": 0.57, + "learning_rate": 9.019985651834703e-07, + "logits/chosen": -1.0152562856674194, + "logits/rejected": -1.011940360069275, + "logps/chosen": -67.2203140258789, + "logps/rejected": -114.62171936035156, + "loss": 0.1769, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6607855558395386, + "rewards/margins": 4.153834819793701, + "rewards/rejected": -4.814620018005371, + "step": 372 + }, + { + "epoch": 0.57, + "learning_rate": 9.012082594361075e-07, + "logits/chosen": -0.9807397127151489, + "logits/rejected": -1.0203466415405273, + "logps/chosen": -56.298583984375, + "logps/rejected": -98.74402618408203, + "loss": 0.171, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.09569351375102997, + "rewards/margins": 4.102729797363281, + "rewards/rejected": -4.0070366859436035, + "step": 373 + }, + { + "epoch": 0.57, + "learning_rate": 9.004151290302349e-07, + "logits/chosen": -1.163325548171997, + "logits/rejected": -1.0914274454116821, + "logps/chosen": -68.55766296386719, + "logps/rejected": -156.38412475585938, + "loss": 0.1497, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.25641486048698425, + "rewards/margins": 7.406113624572754, + "rewards/rejected": -7.6625285148620605, + "step": 374 + }, + { + "epoch": 0.57, + "learning_rate": 8.996191795497919e-07, + "logits/chosen": -1.2768843173980713, + "logits/rejected": -1.240263819694519, + "logps/chosen": -68.02412414550781, + "logps/rejected": -153.4176788330078, + "loss": 0.1802, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3844533860683441, + "rewards/margins": 7.109435558319092, + "rewards/rejected": -7.493888854980469, + "step": 375 + }, + { + "epoch": 0.57, + "learning_rate": 8.988204165985649e-07, + "logits/chosen": -1.1378958225250244, + "logits/rejected": -1.1830037832260132, + "logps/chosen": -70.71237182617188, + "logps/rejected": -103.13880920410156, + "loss": 0.2277, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7702474594116211, + "rewards/margins": 3.580688238143921, + "rewards/rejected": -4.350935459136963, + "step": 376 + }, + { + "epoch": 0.57, + "learning_rate": 8.980188458001485e-07, + "logits/chosen": -1.1306016445159912, + "logits/rejected": -1.0990478992462158, + "logps/chosen": -55.12686538696289, + "logps/rejected": -116.69119262695312, + "loss": 0.1967, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9413877129554749, + "rewards/margins": 5.052735328674316, + "rewards/rejected": -5.9941229820251465, + "step": 377 + }, + { + "epoch": 0.57, + "learning_rate": 8.972144727979055e-07, + "logits/chosen": -1.2199490070343018, + "logits/rejected": -1.1264431476593018, + "logps/chosen": -72.9341049194336, + "logps/rejected": -156.02508544921875, + "loss": 0.2169, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3027689456939697, + "rewards/margins": 6.640481948852539, + "rewards/rejected": -7.943251609802246, + "step": 378 + }, + { + "epoch": 0.58, + "learning_rate": 8.964073032549274e-07, + "logits/chosen": -1.0452485084533691, + "logits/rejected": -0.9996328949928284, + "logps/chosen": -82.88663482666016, + "logps/rejected": -162.11898803710938, + "loss": 0.267, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3536241054534912, + "rewards/margins": 6.493758678436279, + "rewards/rejected": -7.847382545471191, + "step": 379 + }, + { + "epoch": 0.58, + "learning_rate": 8.955973428539942e-07, + "logits/chosen": -0.8747669458389282, + "logits/rejected": -0.786665678024292, + "logps/chosen": -77.19548797607422, + "logps/rejected": -172.11715698242188, + "loss": 0.1487, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.6060906648635864, + "rewards/margins": 6.885627269744873, + "rewards/rejected": -8.491718292236328, + "step": 380 + }, + { + "epoch": 0.58, + "learning_rate": 8.947845972975347e-07, + "logits/chosen": -1.1507608890533447, + "logits/rejected": -1.093747615814209, + "logps/chosen": -45.77164077758789, + "logps/rejected": -102.840576171875, + "loss": 0.1672, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4861528277397156, + "rewards/margins": 4.780185699462891, + "rewards/rejected": -4.294033050537109, + "step": 381 + }, + { + "epoch": 0.58, + "learning_rate": 8.939690723075864e-07, + "logits/chosen": -0.8670116066932678, + "logits/rejected": -0.8109041452407837, + "logps/chosen": -90.49297332763672, + "logps/rejected": -167.0618896484375, + "loss": 0.1867, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.614911437034607, + "rewards/margins": 5.28792142868042, + "rewards/rejected": -6.902832508087158, + "step": 382 + }, + { + "epoch": 0.58, + "learning_rate": 8.931507736257548e-07, + "logits/chosen": -1.2159194946289062, + "logits/rejected": -1.1381936073303223, + "logps/chosen": -58.98343276977539, + "logps/rejected": -132.68360900878906, + "loss": 0.2547, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.4426393508911133, + "rewards/margins": 6.3477582931518555, + "rewards/rejected": -6.790397644042969, + "step": 383 + }, + { + "epoch": 0.58, + "learning_rate": 8.923297070131737e-07, + "logits/chosen": -1.3936126232147217, + "logits/rejected": -1.4558279514312744, + "logps/chosen": -61.366458892822266, + "logps/rejected": -142.0255584716797, + "loss": 0.1539, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5890704989433289, + "rewards/margins": 6.175658226013184, + "rewards/rejected": -6.764728546142578, + "step": 384 + }, + { + "epoch": 0.58, + "learning_rate": 8.915058782504634e-07, + "logits/chosen": -0.9364544749259949, + "logits/rejected": -0.9187269806861877, + "logps/chosen": -51.528404235839844, + "logps/rejected": -127.87124633789062, + "loss": 0.2382, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.18791763484477997, + "rewards/margins": 5.724534034729004, + "rewards/rejected": -5.536615371704102, + "step": 385 + }, + { + "epoch": 0.59, + "learning_rate": 8.906792931376914e-07, + "logits/chosen": -0.9947543740272522, + "logits/rejected": -1.000620722770691, + "logps/chosen": -73.4270248413086, + "logps/rejected": -113.23756408691406, + "loss": 0.1668, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.42094066739082336, + "rewards/margins": 4.028453350067139, + "rewards/rejected": -4.449394226074219, + "step": 386 + }, + { + "epoch": 0.59, + "learning_rate": 8.898499574943309e-07, + "logits/chosen": -1.025214672088623, + "logits/rejected": -0.9233087301254272, + "logps/chosen": -49.42487335205078, + "logps/rejected": -108.94894409179688, + "loss": 0.1782, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.6033812761306763, + "rewards/margins": 5.041341304779053, + "rewards/rejected": -4.437960147857666, + "step": 387 + }, + { + "epoch": 0.59, + "learning_rate": 8.890178771592197e-07, + "logits/chosen": -1.1705193519592285, + "logits/rejected": -1.122748851776123, + "logps/chosen": -36.10163879394531, + "logps/rejected": -83.32039642333984, + "loss": 0.1825, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.7399885654449463, + "rewards/margins": 3.629817247390747, + "rewards/rejected": -2.889828681945801, + "step": 388 + }, + { + "epoch": 0.59, + "learning_rate": 8.881830579905194e-07, + "logits/chosen": -1.4119915962219238, + "logits/rejected": -1.378150224685669, + "logps/chosen": -82.31924438476562, + "logps/rejected": -147.86798095703125, + "loss": 0.1762, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6388192176818848, + "rewards/margins": 6.267736434936523, + "rewards/rejected": -6.906556129455566, + "step": 389 + }, + { + "epoch": 0.59, + "learning_rate": 8.87345505865674e-07, + "logits/chosen": -1.1888000965118408, + "logits/rejected": -1.082751750946045, + "logps/chosen": -69.81549835205078, + "logps/rejected": -129.64540100097656, + "loss": 0.2374, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8034672737121582, + "rewards/margins": 4.446656703948975, + "rewards/rejected": -5.250123977661133, + "step": 390 + }, + { + "epoch": 0.59, + "learning_rate": 8.865052266813685e-07, + "logits/chosen": -1.2450262308120728, + "logits/rejected": -1.1116446256637573, + "logps/chosen": -82.40213775634766, + "logps/rejected": -175.6051788330078, + "loss": 0.2482, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3452680110931396, + "rewards/margins": 6.234772682189941, + "rewards/rejected": -7.580041408538818, + "step": 391 + }, + { + "epoch": 0.6, + "learning_rate": 8.856622263534874e-07, + "logits/chosen": -1.3854156732559204, + "logits/rejected": -1.3954724073410034, + "logps/chosen": -50.992088317871094, + "logps/rejected": -120.94493865966797, + "loss": 0.18, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.28074389696121216, + "rewards/margins": 6.036993980407715, + "rewards/rejected": -5.75624942779541, + "step": 392 + }, + { + "epoch": 0.6, + "learning_rate": 8.848165108170731e-07, + "logits/chosen": -1.0285769701004028, + "logits/rejected": -0.9848490357398987, + "logps/chosen": -77.45350646972656, + "logps/rejected": -174.41880798339844, + "loss": 0.1501, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0908023938536644, + "rewards/margins": 8.113433837890625, + "rewards/rejected": -8.02263069152832, + "step": 393 + }, + { + "epoch": 0.6, + "learning_rate": 8.839680860262844e-07, + "logits/chosen": -0.9170817136764526, + "logits/rejected": -0.8319286108016968, + "logps/chosen": -41.26301193237305, + "logps/rejected": -123.33838653564453, + "loss": 0.1444, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7206428647041321, + "rewards/margins": 5.508768081665039, + "rewards/rejected": -4.788125514984131, + "step": 394 + }, + { + "epoch": 0.6, + "learning_rate": 8.831169579543538e-07, + "logits/chosen": -1.1199142932891846, + "logits/rejected": -1.2598261833190918, + "logps/chosen": -62.73654556274414, + "logps/rejected": -113.8770523071289, + "loss": 0.208, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5435269474983215, + "rewards/margins": 4.305499076843262, + "rewards/rejected": -4.849026679992676, + "step": 395 + }, + { + "epoch": 0.6, + "learning_rate": 8.822631325935463e-07, + "logits/chosen": -1.354875087738037, + "logits/rejected": -1.3106560707092285, + "logps/chosen": -56.36314010620117, + "logps/rejected": -105.28909301757812, + "loss": 0.1578, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7352412939071655, + "rewards/margins": 4.004746437072754, + "rewards/rejected": -3.269505262374878, + "step": 396 + }, + { + "epoch": 0.6, + "learning_rate": 8.814066159551165e-07, + "logits/chosen": -1.1495639085769653, + "logits/rejected": -1.042298436164856, + "logps/chosen": -66.9743881225586, + "logps/rejected": -137.67288208007812, + "loss": 0.2452, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3965058922767639, + "rewards/margins": 5.131819725036621, + "rewards/rejected": -5.52832555770874, + "step": 397 + }, + { + "epoch": 0.6, + "learning_rate": 8.805474140692669e-07, + "logits/chosen": -1.2664752006530762, + "logits/rejected": -1.214312195777893, + "logps/chosen": -73.468017578125, + "logps/rejected": -154.4588165283203, + "loss": 0.1813, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.47094786167144775, + "rewards/margins": 6.118612766265869, + "rewards/rejected": -6.589560508728027, + "step": 398 + }, + { + "epoch": 0.61, + "learning_rate": 8.796855329851052e-07, + "logits/chosen": -1.1437548398971558, + "logits/rejected": -1.1186609268188477, + "logps/chosen": -52.121925354003906, + "logps/rejected": -108.3409652709961, + "loss": 0.1657, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2030365765094757, + "rewards/margins": 4.836932182312012, + "rewards/rejected": -4.633895397186279, + "step": 399 + }, + { + "epoch": 0.61, + "learning_rate": 8.788209787706014e-07, + "logits/chosen": -1.1624184846878052, + "logits/rejected": -1.1892285346984863, + "logps/chosen": -50.03237533569336, + "logps/rejected": -90.02861022949219, + "loss": 0.2179, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.05295214056968689, + "rewards/margins": 4.0794854164123535, + "rewards/rejected": -4.132437705993652, + "step": 400 + }, + { + "epoch": 0.61, + "learning_rate": 8.779537575125455e-07, + "logits/chosen": -1.2393620014190674, + "logits/rejected": -1.1940300464630127, + "logps/chosen": -49.26935958862305, + "logps/rejected": -111.75259399414062, + "loss": 0.2443, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5827302932739258, + "rewards/margins": 4.7527031898498535, + "rewards/rejected": -4.1699724197387695, + "step": 401 + }, + { + "epoch": 0.61, + "learning_rate": 8.770838753165044e-07, + "logits/chosen": -0.9424704313278198, + "logits/rejected": -0.8532572984695435, + "logps/chosen": -51.71316146850586, + "logps/rejected": -123.20469665527344, + "loss": 0.196, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.06543193012475967, + "rewards/margins": 4.748517036437988, + "rewards/rejected": -4.6830854415893555, + "step": 402 + }, + { + "epoch": 0.61, + "learning_rate": 8.762113383067793e-07, + "logits/chosen": -1.0254884958267212, + "logits/rejected": -0.8957661390304565, + "logps/chosen": -66.71129608154297, + "logps/rejected": -159.77406311035156, + "loss": 0.2056, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3826755881309509, + "rewards/margins": 6.118356704711914, + "rewards/rejected": -6.501031875610352, + "step": 403 + }, + { + "epoch": 0.61, + "learning_rate": 8.753361526263621e-07, + "logits/chosen": -1.2008506059646606, + "logits/rejected": -1.0892599821090698, + "logps/chosen": -67.92991638183594, + "logps/rejected": -161.24533081054688, + "loss": 0.1915, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2895081639289856, + "rewards/margins": 5.8394365310668945, + "rewards/rejected": -6.128944396972656, + "step": 404 + }, + { + "epoch": 0.62, + "learning_rate": 8.744583244368923e-07, + "logits/chosen": -1.0438123941421509, + "logits/rejected": -1.0480939149856567, + "logps/chosen": -70.23876953125, + "logps/rejected": -138.24124145507812, + "loss": 0.2158, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04963761568069458, + "rewards/margins": 4.248196125030518, + "rewards/rejected": -4.297833442687988, + "step": 405 + }, + { + "epoch": 0.62, + "learning_rate": 8.735778599186136e-07, + "logits/chosen": -1.226418137550354, + "logits/rejected": -1.176148533821106, + "logps/chosen": -50.929847717285156, + "logps/rejected": -96.35321807861328, + "loss": 0.1878, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.22736504673957825, + "rewards/margins": 3.7616052627563477, + "rewards/rejected": -3.5342397689819336, + "step": 406 + }, + { + "epoch": 0.62, + "learning_rate": 8.726947652703307e-07, + "logits/chosen": -1.170417308807373, + "logits/rejected": -1.1636468172073364, + "logps/chosen": -57.47322082519531, + "logps/rejected": -143.91566467285156, + "loss": 0.2255, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.058366671204566956, + "rewards/margins": 5.934195518493652, + "rewards/rejected": -5.875828742980957, + "step": 407 + }, + { + "epoch": 0.62, + "learning_rate": 8.718090467093653e-07, + "logits/chosen": -1.2647299766540527, + "logits/rejected": -1.3104499578475952, + "logps/chosen": -68.82608032226562, + "logps/rejected": -103.34474182128906, + "loss": 0.1692, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.005170680582523346, + "rewards/margins": 3.650045394897461, + "rewards/rejected": -3.6448748111724854, + "step": 408 + }, + { + "epoch": 0.62, + "learning_rate": 8.709207104715124e-07, + "logits/chosen": -1.369813084602356, + "logits/rejected": -1.238029956817627, + "logps/chosen": -51.8563346862793, + "logps/rejected": -129.11083984375, + "loss": 0.2409, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.6820074319839478, + "rewards/margins": 4.948735237121582, + "rewards/rejected": -4.266727924346924, + "step": 409 + }, + { + "epoch": 0.62, + "learning_rate": 8.700297628109964e-07, + "logits/chosen": -1.0370895862579346, + "logits/rejected": -0.9580932259559631, + "logps/chosen": -70.1031723022461, + "logps/rejected": -165.79930114746094, + "loss": 0.2278, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.26600658893585205, + "rewards/margins": 6.668997764587402, + "rewards/rejected": -6.40299129486084, + "step": 410 + }, + { + "epoch": 0.62, + "learning_rate": 8.691362100004273e-07, + "logits/chosen": -1.1846193075180054, + "logits/rejected": -1.2501457929611206, + "logps/chosen": -75.08912658691406, + "logps/rejected": -120.98992919921875, + "loss": 0.2238, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.41689085960388184, + "rewards/margins": 4.603246212005615, + "rewards/rejected": -5.020136833190918, + "step": 411 + }, + { + "epoch": 0.63, + "learning_rate": 8.68240058330756e-07, + "logits/chosen": -1.1766867637634277, + "logits/rejected": -1.1819841861724854, + "logps/chosen": -40.93233108520508, + "logps/rejected": -90.56710052490234, + "loss": 0.2187, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.7101311683654785, + "rewards/margins": 4.047136306762695, + "rewards/rejected": -3.337005376815796, + "step": 412 + }, + { + "epoch": 0.63, + "learning_rate": 8.673413141112309e-07, + "logits/chosen": -1.2025368213653564, + "logits/rejected": -1.137385606765747, + "logps/chosen": -53.99886703491211, + "logps/rejected": -119.89103698730469, + "loss": 0.1404, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.3699154257774353, + "rewards/margins": 5.138401985168457, + "rewards/rejected": -4.768486499786377, + "step": 413 + }, + { + "epoch": 0.63, + "learning_rate": 8.664399836693525e-07, + "logits/chosen": -1.2454607486724854, + "logits/rejected": -1.167482852935791, + "logps/chosen": -60.050811767578125, + "logps/rejected": -140.74850463867188, + "loss": 0.1651, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5211348533630371, + "rewards/margins": 5.840010643005371, + "rewards/rejected": -6.361145973205566, + "step": 414 + }, + { + "epoch": 0.63, + "learning_rate": 8.655360733508292e-07, + "logits/chosen": -1.2281296253204346, + "logits/rejected": -1.205501675605774, + "logps/chosen": -54.05847930908203, + "logps/rejected": -114.76741027832031, + "loss": 0.2068, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.048486582934856415, + "rewards/margins": 4.931014060974121, + "rewards/rejected": -4.882527828216553, + "step": 415 + }, + { + "epoch": 0.63, + "learning_rate": 8.646295895195333e-07, + "logits/chosen": -1.1683878898620605, + "logits/rejected": -1.1041896343231201, + "logps/chosen": -48.70958709716797, + "logps/rejected": -115.05414581298828, + "loss": 0.1353, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.47555655241012573, + "rewards/margins": 5.370617389678955, + "rewards/rejected": -4.895061016082764, + "step": 416 + }, + { + "epoch": 0.63, + "learning_rate": 8.637205385574547e-07, + "logits/chosen": -1.1590019464492798, + "logits/rejected": -1.2345454692840576, + "logps/chosen": -70.25973510742188, + "logps/rejected": -153.3055877685547, + "loss": 0.1685, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06854680180549622, + "rewards/margins": 7.001974582672119, + "rewards/rejected": -6.933426856994629, + "step": 417 + }, + { + "epoch": 0.64, + "learning_rate": 8.628089268646579e-07, + "logits/chosen": -0.9455069303512573, + "logits/rejected": -0.9382145404815674, + "logps/chosen": -60.832069396972656, + "logps/rejected": -111.65571594238281, + "loss": 0.1859, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.29948708415031433, + "rewards/margins": 3.8152897357940674, + "rewards/rejected": -4.114776611328125, + "step": 418 + }, + { + "epoch": 0.64, + "learning_rate": 8.618947608592351e-07, + "logits/chosen": -1.022441029548645, + "logits/rejected": -1.0866788625717163, + "logps/chosen": -69.27745056152344, + "logps/rejected": -123.41693115234375, + "loss": 0.112, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3073914349079132, + "rewards/margins": 5.595177173614502, + "rewards/rejected": -5.902567386627197, + "step": 419 + }, + { + "epoch": 0.64, + "learning_rate": 8.609780469772621e-07, + "logits/chosen": -1.0906612873077393, + "logits/rejected": -1.0917662382125854, + "logps/chosen": -84.9109115600586, + "logps/rejected": -181.5916748046875, + "loss": 0.2012, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4173294007778168, + "rewards/margins": 8.144667625427246, + "rewards/rejected": -8.561997413635254, + "step": 420 + }, + { + "epoch": 0.64, + "learning_rate": 8.600587916727532e-07, + "logits/chosen": -0.818186342716217, + "logits/rejected": -0.8237495422363281, + "logps/chosen": -46.085514068603516, + "logps/rejected": -100.67369842529297, + "loss": 0.1745, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.13958895206451416, + "rewards/margins": 4.293861389160156, + "rewards/rejected": -4.15427303314209, + "step": 421 + }, + { + "epoch": 0.64, + "learning_rate": 8.591370014176144e-07, + "logits/chosen": -1.2785382270812988, + "logits/rejected": -1.270959496498108, + "logps/chosen": -81.02368927001953, + "logps/rejected": -142.10545349121094, + "loss": 0.1524, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0255403518676758, + "rewards/margins": 5.374872207641602, + "rewards/rejected": -6.4004130363464355, + "step": 422 + }, + { + "epoch": 0.64, + "learning_rate": 8.582126827015992e-07, + "logits/chosen": -1.2582919597625732, + "logits/rejected": -1.2366271018981934, + "logps/chosen": -90.80106353759766, + "logps/rejected": -185.97265625, + "loss": 0.1604, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2223063707351685, + "rewards/margins": 8.304044723510742, + "rewards/rejected": -9.526350975036621, + "step": 423 + }, + { + "epoch": 0.64, + "learning_rate": 8.572858420322627e-07, + "logits/chosen": -0.9271082878112793, + "logits/rejected": -0.9362425804138184, + "logps/chosen": -77.71623992919922, + "logps/rejected": -125.13803100585938, + "loss": 0.2093, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.0138859748840332, + "rewards/margins": 4.141689777374268, + "rewards/rejected": -5.155575752258301, + "step": 424 + }, + { + "epoch": 0.65, + "learning_rate": 8.563564859349147e-07, + "logits/chosen": -1.1701502799987793, + "logits/rejected": -1.0111591815948486, + "logps/chosen": -71.44900512695312, + "logps/rejected": -142.65487670898438, + "loss": 0.1933, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7668020725250244, + "rewards/margins": 5.6033477783203125, + "rewards/rejected": -6.3701491355896, + "step": 425 + }, + { + "epoch": 0.65, + "learning_rate": 8.554246209525755e-07, + "logits/chosen": -1.1665081977844238, + "logits/rejected": -1.1051981449127197, + "logps/chosen": -52.993370056152344, + "logps/rejected": -123.57511138916016, + "loss": 0.2014, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2413010597229004, + "rewards/margins": 5.42091178894043, + "rewards/rejected": -5.662213325500488, + "step": 426 + }, + { + "epoch": 0.65, + "learning_rate": 8.544902536459283e-07, + "logits/chosen": -1.0885193347930908, + "logits/rejected": -1.0061079263687134, + "logps/chosen": -78.6636962890625, + "logps/rejected": -166.54043579101562, + "loss": 0.1941, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6128672361373901, + "rewards/margins": 6.642463207244873, + "rewards/rejected": -8.255331039428711, + "step": 427 + }, + { + "epoch": 0.65, + "learning_rate": 8.535533905932737e-07, + "logits/chosen": -1.0796353816986084, + "logits/rejected": -1.1003433465957642, + "logps/chosen": -59.647865295410156, + "logps/rejected": -116.34637451171875, + "loss": 0.1723, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1189768984913826, + "rewards/margins": 5.141149044036865, + "rewards/rejected": -5.260126113891602, + "step": 428 + }, + { + "epoch": 0.65, + "learning_rate": 8.526140383904836e-07, + "logits/chosen": -0.939657986164093, + "logits/rejected": -0.9173675775527954, + "logps/chosen": -66.11197662353516, + "logps/rejected": -147.4273681640625, + "loss": 0.1459, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6731550097465515, + "rewards/margins": 6.669175148010254, + "rewards/rejected": -7.342329978942871, + "step": 429 + }, + { + "epoch": 0.65, + "learning_rate": 8.516722036509538e-07, + "logits/chosen": -1.3927438259124756, + "logits/rejected": -1.3029999732971191, + "logps/chosen": -64.37356567382812, + "logps/rejected": -149.1137237548828, + "loss": 0.1832, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.45377394556999207, + "rewards/margins": 6.534273147583008, + "rewards/rejected": -6.988047122955322, + "step": 430 + }, + { + "epoch": 0.65, + "learning_rate": 8.50727893005559e-07, + "logits/chosen": -1.2839081287384033, + "logits/rejected": -1.3670390844345093, + "logps/chosen": -75.1611328125, + "logps/rejected": -152.00440979003906, + "loss": 0.1642, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9996129274368286, + "rewards/margins": 6.434969425201416, + "rewards/rejected": -7.434582710266113, + "step": 431 + }, + { + "epoch": 0.66, + "learning_rate": 8.497811131026045e-07, + "logits/chosen": -1.0323083400726318, + "logits/rejected": -0.9453625679016113, + "logps/chosen": -67.27543640136719, + "logps/rejected": -137.97413635253906, + "loss": 0.1446, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5392396450042725, + "rewards/margins": 5.336440086364746, + "rewards/rejected": -5.875679969787598, + "step": 432 + }, + { + "epoch": 0.66, + "learning_rate": 8.488318706077805e-07, + "logits/chosen": -1.1992932558059692, + "logits/rejected": -1.1836185455322266, + "logps/chosen": -57.8773307800293, + "logps/rejected": -136.941162109375, + "loss": 0.219, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.13526064157485962, + "rewards/margins": 6.538956642150879, + "rewards/rejected": -6.674217700958252, + "step": 433 + }, + { + "epoch": 0.66, + "learning_rate": 8.478801722041146e-07, + "logits/chosen": -1.2053327560424805, + "logits/rejected": -1.054445743560791, + "logps/chosen": -64.59292602539062, + "logps/rejected": -153.5313262939453, + "loss": 0.1623, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6657881140708923, + "rewards/margins": 6.392370700836182, + "rewards/rejected": -7.058159351348877, + "step": 434 + }, + { + "epoch": 0.66, + "learning_rate": 8.46926024591925e-07, + "logits/chosen": -0.9527780413627625, + "logits/rejected": -0.9377985596656799, + "logps/chosen": -68.6094970703125, + "logps/rejected": -112.81304931640625, + "loss": 0.1744, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9481337070465088, + "rewards/margins": 4.392820358276367, + "rewards/rejected": -5.340954303741455, + "step": 435 + }, + { + "epoch": 0.66, + "learning_rate": 8.459694344887731e-07, + "logits/chosen": -0.9847865104675293, + "logits/rejected": -1.007370948791504, + "logps/chosen": -53.81441116333008, + "logps/rejected": -91.65478515625, + "loss": 0.1789, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1929657459259033, + "rewards/margins": 3.4950058460235596, + "rewards/rejected": -4.687971591949463, + "step": 436 + }, + { + "epoch": 0.66, + "learning_rate": 8.450104086294165e-07, + "logits/chosen": -1.2699943780899048, + "logits/rejected": -1.2473492622375488, + "logps/chosen": -82.6616439819336, + "logps/rejected": -169.1042938232422, + "loss": 0.1898, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1959168910980225, + "rewards/margins": 6.592803955078125, + "rewards/rejected": -7.788720607757568, + "step": 437 + }, + { + "epoch": 0.67, + "learning_rate": 8.440489537657618e-07, + "logits/chosen": -1.339113473892212, + "logits/rejected": -1.269339680671692, + "logps/chosen": -55.6357307434082, + "logps/rejected": -125.44326782226562, + "loss": 0.2618, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.15911231935024261, + "rewards/margins": 5.041744232177734, + "rewards/rejected": -5.200855731964111, + "step": 438 + }, + { + "epoch": 0.67, + "learning_rate": 8.430850766668161e-07, + "logits/chosen": -1.1586527824401855, + "logits/rejected": -1.1263474225997925, + "logps/chosen": -59.57755661010742, + "logps/rejected": -128.5335693359375, + "loss": 0.2132, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4526219069957733, + "rewards/margins": 4.85505485534668, + "rewards/rejected": -5.307676315307617, + "step": 439 + }, + { + "epoch": 0.67, + "learning_rate": 8.421187841186401e-07, + "logits/chosen": -1.1829062700271606, + "logits/rejected": -1.114036202430725, + "logps/chosen": -77.67008209228516, + "logps/rejected": -159.51492309570312, + "loss": 0.1727, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0863698720932007, + "rewards/margins": 6.396143436431885, + "rewards/rejected": -7.482513427734375, + "step": 440 + }, + { + "epoch": 0.67, + "learning_rate": 8.411500829243005e-07, + "logits/chosen": -1.3695244789123535, + "logits/rejected": -1.2936193943023682, + "logps/chosen": -57.488224029541016, + "logps/rejected": -125.73988342285156, + "loss": 0.1468, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.4893732964992523, + "rewards/margins": 4.295994281768799, + "rewards/rejected": -4.785367012023926, + "step": 441 + }, + { + "epoch": 0.67, + "learning_rate": 8.401789799038216e-07, + "logits/chosen": -1.1487072706222534, + "logits/rejected": -1.1367806196212769, + "logps/chosen": -69.29662322998047, + "logps/rejected": -139.33428955078125, + "loss": 0.2547, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7461766004562378, + "rewards/margins": 5.8961663246154785, + "rewards/rejected": -6.642343044281006, + "step": 442 + }, + { + "epoch": 0.67, + "learning_rate": 8.392054818941374e-07, + "logits/chosen": -1.072195291519165, + "logits/rejected": -1.1083810329437256, + "logps/chosen": -51.24053192138672, + "logps/rejected": -133.1757354736328, + "loss": 0.2195, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6234368085861206, + "rewards/margins": 6.912321090698242, + "rewards/rejected": -6.288885116577148, + "step": 443 + }, + { + "epoch": 0.67, + "learning_rate": 8.382295957490435e-07, + "logits/chosen": -1.0317915678024292, + "logits/rejected": -1.020350456237793, + "logps/chosen": -54.91352844238281, + "logps/rejected": -111.52494049072266, + "loss": 0.184, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.05642195791006088, + "rewards/margins": 5.561900615692139, + "rewards/rejected": -5.505478382110596, + "step": 444 + }, + { + "epoch": 0.68, + "learning_rate": 8.372513283391489e-07, + "logits/chosen": -1.2246626615524292, + "logits/rejected": -1.1847219467163086, + "logps/chosen": -67.77139282226562, + "logps/rejected": -144.07675170898438, + "loss": 0.1231, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8761771321296692, + "rewards/margins": 5.497477054595947, + "rewards/rejected": -6.373654365539551, + "step": 445 + }, + { + "epoch": 0.68, + "learning_rate": 8.36270686551828e-07, + "logits/chosen": -1.3635679483413696, + "logits/rejected": -1.4009004831314087, + "logps/chosen": -57.39094924926758, + "logps/rejected": -99.71837615966797, + "loss": 0.1374, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.02806789055466652, + "rewards/margins": 4.0962934494018555, + "rewards/rejected": -4.124361515045166, + "step": 446 + }, + { + "epoch": 0.68, + "learning_rate": 8.35287677291171e-07, + "logits/chosen": -1.0316144227981567, + "logits/rejected": -0.9815775156021118, + "logps/chosen": -49.208290100097656, + "logps/rejected": -99.20479583740234, + "loss": 0.2081, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.08192768692970276, + "rewards/margins": 3.8505711555480957, + "rewards/rejected": -3.9324989318847656, + "step": 447 + }, + { + "epoch": 0.68, + "learning_rate": 8.343023074779368e-07, + "logits/chosen": -0.8122367858886719, + "logits/rejected": -0.8544483780860901, + "logps/chosen": -42.28432846069336, + "logps/rejected": -93.08744049072266, + "loss": 0.1256, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.23971232771873474, + "rewards/margins": 4.2626051902771, + "rewards/rejected": -4.022892475128174, + "step": 448 + }, + { + "epoch": 0.68, + "learning_rate": 8.333145840495027e-07, + "logits/chosen": -1.1597957611083984, + "logits/rejected": -1.127702236175537, + "logps/chosen": -57.63410568237305, + "logps/rejected": -108.86802673339844, + "loss": 0.1385, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.15871790051460266, + "rewards/margins": 4.300457000732422, + "rewards/rejected": -4.141739845275879, + "step": 449 + }, + { + "epoch": 0.68, + "learning_rate": 8.32324513959817e-07, + "logits/chosen": -1.239582896232605, + "logits/rejected": -1.27329683303833, + "logps/chosen": -65.73753356933594, + "logps/rejected": -123.59443664550781, + "loss": 0.1486, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.25332725048065186, + "rewards/margins": 4.592188835144043, + "rewards/rejected": -4.845516681671143, + "step": 450 + }, + { + "epoch": 0.69, + "learning_rate": 8.313321041793491e-07, + "logits/chosen": -1.1276596784591675, + "logits/rejected": -1.1869280338287354, + "logps/chosen": -61.36394119262695, + "logps/rejected": -142.14089965820312, + "loss": 0.1277, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.573635995388031, + "rewards/margins": 6.9199676513671875, + "rewards/rejected": -6.346331596374512, + "step": 451 + }, + { + "epoch": 0.69, + "learning_rate": 8.303373616950406e-07, + "logits/chosen": -0.9326416254043579, + "logits/rejected": -0.9587483406066895, + "logps/chosen": -43.44063949584961, + "logps/rejected": -98.64341735839844, + "loss": 0.2942, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.524234414100647, + "rewards/margins": 4.821613311767578, + "rewards/rejected": -4.297379016876221, + "step": 452 + }, + { + "epoch": 0.69, + "learning_rate": 8.293402935102566e-07, + "logits/chosen": -1.2515349388122559, + "logits/rejected": -1.300333023071289, + "logps/chosen": -76.35446166992188, + "logps/rejected": -148.16094970703125, + "loss": 0.1673, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3025237023830414, + "rewards/margins": 6.630747318267822, + "rewards/rejected": -6.933270454406738, + "step": 453 + }, + { + "epoch": 0.69, + "learning_rate": 8.283409066447355e-07, + "logits/chosen": -1.1542006731033325, + "logits/rejected": -1.1590346097946167, + "logps/chosen": -55.921749114990234, + "logps/rejected": -125.4809799194336, + "loss": 0.1627, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.44540640711784363, + "rewards/margins": 5.576838493347168, + "rewards/rejected": -5.131432056427002, + "step": 454 + }, + { + "epoch": 0.69, + "learning_rate": 8.273392081345404e-07, + "logits/chosen": -1.192995548248291, + "logits/rejected": -1.053298830986023, + "logps/chosen": -66.12108612060547, + "logps/rejected": -146.68898010253906, + "loss": 0.1636, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.0854855328798294, + "rewards/margins": 5.814007759094238, + "rewards/rejected": -5.728522300720215, + "step": 455 + }, + { + "epoch": 0.69, + "learning_rate": 8.263352050320094e-07, + "logits/chosen": -1.0546703338623047, + "logits/rejected": -1.040149450302124, + "logps/chosen": -54.151493072509766, + "logps/rejected": -125.35000610351562, + "loss": 0.1212, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0205656997859478, + "rewards/margins": 5.513425827026367, + "rewards/rejected": -5.53399133682251, + "step": 456 + }, + { + "epoch": 0.69, + "learning_rate": 8.253289044057053e-07, + "logits/chosen": -1.203826904296875, + "logits/rejected": -1.198434829711914, + "logps/chosen": -55.11236572265625, + "logps/rejected": -112.63180541992188, + "loss": 0.1271, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.22854329645633698, + "rewards/margins": 4.5432257652282715, + "rewards/rejected": -4.771769046783447, + "step": 457 + }, + { + "epoch": 0.7, + "learning_rate": 8.243203133403671e-07, + "logits/chosen": -1.0231239795684814, + "logits/rejected": -0.9769048094749451, + "logps/chosen": -47.63315200805664, + "logps/rejected": -90.92638397216797, + "loss": 0.1778, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.21240568161010742, + "rewards/margins": 4.013316631317139, + "rewards/rejected": -3.800910472869873, + "step": 458 + }, + { + "epoch": 0.7, + "learning_rate": 8.233094389368584e-07, + "logits/chosen": -1.0631383657455444, + "logits/rejected": -1.0314054489135742, + "logps/chosen": -59.08259582519531, + "logps/rejected": -123.09644317626953, + "loss": 0.1874, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.03381906449794769, + "rewards/margins": 5.111746788024902, + "rewards/rejected": -5.077927589416504, + "step": 459 + }, + { + "epoch": 0.7, + "learning_rate": 8.222962883121195e-07, + "logits/chosen": -1.1790649890899658, + "logits/rejected": -1.1812490224838257, + "logps/chosen": -72.05509948730469, + "logps/rejected": -149.14263916015625, + "loss": 0.1478, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.024528905749320984, + "rewards/margins": 6.314554214477539, + "rewards/rejected": -6.290025234222412, + "step": 460 + }, + { + "epoch": 0.7, + "learning_rate": 8.21280868599115e-07, + "logits/chosen": -1.2503714561462402, + "logits/rejected": -1.3049639463424683, + "logps/chosen": -55.94298553466797, + "logps/rejected": -105.9774169921875, + "loss": 0.1861, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.1556202620267868, + "rewards/margins": 4.34453010559082, + "rewards/rejected": -4.188909530639648, + "step": 461 + }, + { + "epoch": 0.7, + "learning_rate": 8.202631869467858e-07, + "logits/chosen": -1.1742231845855713, + "logits/rejected": -1.0610723495483398, + "logps/chosen": -61.1878662109375, + "logps/rejected": -159.06466674804688, + "loss": 0.1741, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2823966145515442, + "rewards/margins": 7.47265625, + "rewards/rejected": -7.190260410308838, + "step": 462 + }, + { + "epoch": 0.7, + "learning_rate": 8.192432505199966e-07, + "logits/chosen": -1.2517600059509277, + "logits/rejected": -1.2136309146881104, + "logps/chosen": -68.1708755493164, + "logps/rejected": -165.7637939453125, + "loss": 0.1884, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06372582912445068, + "rewards/margins": 7.516479969024658, + "rewards/rejected": -7.580205917358398, + "step": 463 + }, + { + "epoch": 0.7, + "learning_rate": 8.182210664994877e-07, + "logits/chosen": -1.2376246452331543, + "logits/rejected": -1.2937442064285278, + "logps/chosen": -62.00231170654297, + "logps/rejected": -131.11273193359375, + "loss": 0.2543, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1723182648420334, + "rewards/margins": 6.003687381744385, + "rewards/rejected": -5.831368446350098, + "step": 464 + }, + { + "epoch": 0.71, + "learning_rate": 8.171966420818227e-07, + "logits/chosen": -1.1604633331298828, + "logits/rejected": -1.1980036497116089, + "logps/chosen": -68.14649963378906, + "logps/rejected": -130.35452270507812, + "loss": 0.1336, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.35314807295799255, + "rewards/margins": 5.461781978607178, + "rewards/rejected": -5.814929962158203, + "step": 465 + }, + { + "epoch": 0.71, + "learning_rate": 8.161699844793384e-07, + "logits/chosen": -1.22269606590271, + "logits/rejected": -1.1873986721038818, + "logps/chosen": -58.53391647338867, + "logps/rejected": -124.73702239990234, + "loss": 0.2184, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.0939812958240509, + "rewards/margins": 4.914519309997559, + "rewards/rejected": -4.82053804397583, + "step": 466 + }, + { + "epoch": 0.71, + "learning_rate": 8.151411009200941e-07, + "logits/chosen": -0.9256694316864014, + "logits/rejected": -0.9075564742088318, + "logps/chosen": -48.61707305908203, + "logps/rejected": -124.52507019042969, + "loss": 0.1166, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.005536448210477829, + "rewards/margins": 5.7331953048706055, + "rewards/rejected": -5.727659225463867, + "step": 467 + }, + { + "epoch": 0.71, + "learning_rate": 8.141099986478212e-07, + "logits/chosen": -1.1238139867782593, + "logits/rejected": -1.0240187644958496, + "logps/chosen": -69.22377014160156, + "logps/rejected": -145.86741638183594, + "loss": 0.1237, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.32074204087257385, + "rewards/margins": 6.419075965881348, + "rewards/rejected": -6.739818572998047, + "step": 468 + }, + { + "epoch": 0.71, + "learning_rate": 8.130766849218708e-07, + "logits/chosen": -1.5412321090698242, + "logits/rejected": -1.4462655782699585, + "logps/chosen": -47.1432991027832, + "logps/rejected": -125.11244201660156, + "loss": 0.1467, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.27659279108047485, + "rewards/margins": 5.030959129333496, + "rewards/rejected": -4.754365921020508, + "step": 469 + }, + { + "epoch": 0.71, + "learning_rate": 8.120411670171642e-07, + "logits/chosen": -1.2974660396575928, + "logits/rejected": -1.2382835149765015, + "logps/chosen": -49.07151412963867, + "logps/rejected": -117.94173431396484, + "loss": 0.1567, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8536134958267212, + "rewards/margins": 5.4294915199279785, + "rewards/rejected": -4.575877666473389, + "step": 470 + }, + { + "epoch": 0.72, + "learning_rate": 8.110034522241407e-07, + "logits/chosen": -1.2068498134613037, + "logits/rejected": -1.191725254058838, + "logps/chosen": -68.56617736816406, + "logps/rejected": -155.53311157226562, + "loss": 0.1059, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.3536372780799866, + "rewards/margins": 6.959292411804199, + "rewards/rejected": -6.605655193328857, + "step": 471 + }, + { + "epoch": 0.72, + "learning_rate": 8.099635478487064e-07, + "logits/chosen": -1.2709250450134277, + "logits/rejected": -1.2953561544418335, + "logps/chosen": -58.28156280517578, + "logps/rejected": -143.70582580566406, + "loss": 0.1743, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09408218413591385, + "rewards/margins": 7.253283977508545, + "rewards/rejected": -7.1592020988464355, + "step": 472 + }, + { + "epoch": 0.72, + "learning_rate": 8.08921461212183e-07, + "logits/chosen": -1.327064037322998, + "logits/rejected": -1.2830547094345093, + "logps/chosen": -58.22392272949219, + "logps/rejected": -138.6402130126953, + "loss": 0.143, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.7211057543754578, + "rewards/margins": 5.940493106842041, + "rewards/rejected": -5.219387531280518, + "step": 473 + }, + { + "epoch": 0.72, + "learning_rate": 8.078771996512565e-07, + "logits/chosen": -1.2072927951812744, + "logits/rejected": -1.2078496217727661, + "logps/chosen": -54.813236236572266, + "logps/rejected": -107.05663299560547, + "loss": 0.1953, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3342176079750061, + "rewards/margins": 4.0993547439575195, + "rewards/rejected": -4.433572292327881, + "step": 474 + }, + { + "epoch": 0.72, + "learning_rate": 8.068307705179246e-07, + "logits/chosen": -1.2271807193756104, + "logits/rejected": -1.1826114654541016, + "logps/chosen": -69.37525939941406, + "logps/rejected": -181.13885498046875, + "loss": 0.1718, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.07493260502815247, + "rewards/margins": 8.193195343017578, + "rewards/rejected": -8.118261337280273, + "step": 475 + }, + { + "epoch": 0.72, + "learning_rate": 8.057821811794457e-07, + "logits/chosen": -0.9722121357917786, + "logits/rejected": -1.0049093961715698, + "logps/chosen": -55.88921356201172, + "logps/rejected": -112.35614013671875, + "loss": 0.1317, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.43203961849212646, + "rewards/margins": 4.577415943145752, + "rewards/rejected": -5.009455680847168, + "step": 476 + }, + { + "epoch": 0.72, + "learning_rate": 8.047314390182871e-07, + "logits/chosen": -1.0967211723327637, + "logits/rejected": -1.109215497970581, + "logps/chosen": -47.45706558227539, + "logps/rejected": -87.63819885253906, + "loss": 0.2356, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.3036121428012848, + "rewards/margins": 4.191996097564697, + "rewards/rejected": -3.8883838653564453, + "step": 477 + }, + { + "epoch": 0.73, + "learning_rate": 8.036785514320725e-07, + "logits/chosen": -1.1749989986419678, + "logits/rejected": -1.1614140272140503, + "logps/chosen": -41.67080307006836, + "logps/rejected": -100.26565551757812, + "loss": 0.1679, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.3063531517982483, + "rewards/margins": 4.6997246742248535, + "rewards/rejected": -4.39337158203125, + "step": 478 + }, + { + "epoch": 0.73, + "learning_rate": 8.026235258335306e-07, + "logits/chosen": -1.225369930267334, + "logits/rejected": -1.1531744003295898, + "logps/chosen": -62.20580291748047, + "logps/rejected": -149.44842529296875, + "loss": 0.1963, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.44672101736068726, + "rewards/margins": 6.923643112182617, + "rewards/rejected": -6.476922512054443, + "step": 479 + }, + { + "epoch": 0.73, + "learning_rate": 8.015663696504423e-07, + "logits/chosen": -1.3094971179962158, + "logits/rejected": -1.230414628982544, + "logps/chosen": -57.678443908691406, + "logps/rejected": -137.21197509765625, + "loss": 0.1987, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.13374830782413483, + "rewards/margins": 6.491249084472656, + "rewards/rejected": -6.624998092651367, + "step": 480 + }, + { + "epoch": 0.73, + "learning_rate": 8.005070903255881e-07, + "logits/chosen": -1.253745436668396, + "logits/rejected": -1.2687311172485352, + "logps/chosen": -70.95086669921875, + "logps/rejected": -124.78730773925781, + "loss": 0.1432, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.0652434229850769, + "rewards/margins": 5.830631256103516, + "rewards/rejected": -5.8958740234375, + "step": 481 + }, + { + "epoch": 0.73, + "learning_rate": 7.994456953166972e-07, + "logits/chosen": -1.1457329988479614, + "logits/rejected": -1.1045362949371338, + "logps/chosen": -56.931854248046875, + "logps/rejected": -122.42547607421875, + "loss": 0.1672, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08954182267189026, + "rewards/margins": 5.752513408660889, + "rewards/rejected": -5.662971496582031, + "step": 482 + }, + { + "epoch": 0.73, + "learning_rate": 7.983821920963935e-07, + "logits/chosen": -1.155022144317627, + "logits/rejected": -1.2023171186447144, + "logps/chosen": -52.90994644165039, + "logps/rejected": -98.7842788696289, + "loss": 0.1284, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.056086815893650055, + "rewards/margins": 4.271111965179443, + "rewards/rejected": -4.327198505401611, + "step": 483 + }, + { + "epoch": 0.74, + "learning_rate": 7.973165881521433e-07, + "logits/chosen": -1.0429362058639526, + "logits/rejected": -1.0378212928771973, + "logps/chosen": -73.509521484375, + "logps/rejected": -155.39572143554688, + "loss": 0.1305, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.593109667301178, + "rewards/margins": 7.16144323348999, + "rewards/rejected": -7.754552841186523, + "step": 484 + }, + { + "epoch": 0.74, + "learning_rate": 7.962488909862033e-07, + "logits/chosen": -1.0125764608383179, + "logits/rejected": -0.977193295955658, + "logps/chosen": -48.4239616394043, + "logps/rejected": -117.32792663574219, + "loss": 0.1391, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21351009607315063, + "rewards/margins": 6.008133411407471, + "rewards/rejected": -5.794623374938965, + "step": 485 + }, + { + "epoch": 0.74, + "learning_rate": 7.951791081155668e-07, + "logits/chosen": -1.4310243129730225, + "logits/rejected": -1.364517331123352, + "logps/chosen": -66.78572082519531, + "logps/rejected": -142.0985107421875, + "loss": 0.1661, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.2934994101524353, + "rewards/margins": 5.637195110321045, + "rewards/rejected": -5.930694103240967, + "step": 486 + }, + { + "epoch": 0.74, + "learning_rate": 7.941072470719116e-07, + "logits/chosen": -1.2644519805908203, + "logits/rejected": -1.2363717555999756, + "logps/chosen": -76.28678894042969, + "logps/rejected": -141.6587677001953, + "loss": 0.1902, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9511383175849915, + "rewards/margins": 5.374536991119385, + "rewards/rejected": -6.325675010681152, + "step": 487 + }, + { + "epoch": 0.74, + "learning_rate": 7.930333154015465e-07, + "logits/chosen": -1.2524796724319458, + "logits/rejected": -1.2368062734603882, + "logps/chosen": -55.79368209838867, + "logps/rejected": -108.14014434814453, + "loss": 0.123, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6211267113685608, + "rewards/margins": 4.497018337249756, + "rewards/rejected": -5.118144989013672, + "step": 488 + }, + { + "epoch": 0.74, + "learning_rate": 7.919573206653582e-07, + "logits/chosen": -1.0712414979934692, + "logits/rejected": -1.0710827112197876, + "logps/chosen": -52.496517181396484, + "logps/rejected": -93.24199676513672, + "loss": 0.2383, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.07469668984413147, + "rewards/margins": 3.50506854057312, + "rewards/rejected": -3.4303717613220215, + "step": 489 + }, + { + "epoch": 0.74, + "learning_rate": 7.908792704387583e-07, + "logits/chosen": -1.2998534440994263, + "logits/rejected": -1.2329963445663452, + "logps/chosen": -59.98297882080078, + "logps/rejected": -114.86883544921875, + "loss": 0.1563, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.42221516370773315, + "rewards/margins": 4.252401351928711, + "rewards/rejected": -4.674615859985352, + "step": 490 + }, + { + "epoch": 0.75, + "learning_rate": 7.8979917231163e-07, + "logits/chosen": -1.4007737636566162, + "logits/rejected": -1.4391206502914429, + "logps/chosen": -85.83271789550781, + "logps/rejected": -160.2294921875, + "loss": 0.1032, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0596859455108643, + "rewards/margins": 5.792131423950195, + "rewards/rejected": -6.8518171310424805, + "step": 491 + }, + { + "epoch": 0.75, + "learning_rate": 7.88717033888274e-07, + "logits/chosen": -1.094614863395691, + "logits/rejected": -1.0967289209365845, + "logps/chosen": -75.23745727539062, + "logps/rejected": -125.14196014404297, + "loss": 0.1518, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2838397026062012, + "rewards/margins": 4.288742542266846, + "rewards/rejected": -5.572582721710205, + "step": 492 + }, + { + "epoch": 0.75, + "learning_rate": 7.876328627873561e-07, + "logits/chosen": -1.2592332363128662, + "logits/rejected": -1.2118233442306519, + "logps/chosen": -60.95644760131836, + "logps/rejected": -116.6041259765625, + "loss": 0.1176, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2381046712398529, + "rewards/margins": 4.668372631072998, + "rewards/rejected": -4.430268287658691, + "step": 493 + }, + { + "epoch": 0.75, + "learning_rate": 7.865466666418521e-07, + "logits/chosen": -1.3860349655151367, + "logits/rejected": -1.296807885169983, + "logps/chosen": -66.50694274902344, + "logps/rejected": -189.37033081054688, + "loss": 0.1512, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0313073992729187, + "rewards/margins": 9.400177955627441, + "rewards/rejected": -9.36886978149414, + "step": 494 + }, + { + "epoch": 0.75, + "learning_rate": 7.854584530989956e-07, + "logits/chosen": -1.0123242139816284, + "logits/rejected": -1.0580317974090576, + "logps/chosen": -53.1533088684082, + "logps/rejected": -86.56595611572266, + "loss": 0.184, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8372935056686401, + "rewards/margins": 3.333448648452759, + "rewards/rejected": -4.170742034912109, + "step": 495 + }, + { + "epoch": 0.75, + "learning_rate": 7.843682298202234e-07, + "logits/chosen": -1.0129303932189941, + "logits/rejected": -0.9923812747001648, + "logps/chosen": -44.199432373046875, + "logps/rejected": -97.03451538085938, + "loss": 0.1203, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.19193431735038757, + "rewards/margins": 4.424201011657715, + "rewards/rejected": -4.232266426086426, + "step": 496 + }, + { + "epoch": 0.76, + "learning_rate": 7.83276004481121e-07, + "logits/chosen": -1.1978256702423096, + "logits/rejected": -1.222652554512024, + "logps/chosen": -76.8789291381836, + "logps/rejected": -130.401123046875, + "loss": 0.201, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2580636739730835, + "rewards/margins": 4.810042381286621, + "rewards/rejected": -6.068106651306152, + "step": 497 + }, + { + "epoch": 0.76, + "learning_rate": 7.821817847713701e-07, + "logits/chosen": -1.1676831245422363, + "logits/rejected": -1.07413649559021, + "logps/chosen": -70.05974578857422, + "logps/rejected": -140.12713623046875, + "loss": 0.1508, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9246506690979004, + "rewards/margins": 5.467977046966553, + "rewards/rejected": -6.392627716064453, + "step": 498 + }, + { + "epoch": 0.76, + "learning_rate": 7.810855783946926e-07, + "logits/chosen": -1.2983942031860352, + "logits/rejected": -1.2959508895874023, + "logps/chosen": -61.566444396972656, + "logps/rejected": -148.5758514404297, + "loss": 0.1084, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5822048187255859, + "rewards/margins": 7.1837873458862305, + "rewards/rejected": -7.765992641448975, + "step": 499 + }, + { + "epoch": 0.76, + "learning_rate": 7.799873930687977e-07, + "logits/chosen": -1.300523042678833, + "logits/rejected": -1.2762212753295898, + "logps/chosen": -63.04548263549805, + "logps/rejected": -120.72624206542969, + "loss": 0.0995, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.772101879119873, + "rewards/margins": 4.946941375732422, + "rewards/rejected": -5.719043254852295, + "step": 500 + }, + { + "epoch": 0.76, + "learning_rate": 7.788872365253271e-07, + "logits/chosen": -1.1984803676605225, + "logits/rejected": -1.142225742340088, + "logps/chosen": -57.733360290527344, + "logps/rejected": -120.88352966308594, + "loss": 0.1906, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.4514312148094177, + "rewards/margins": 4.9673752784729, + "rewards/rejected": -5.418806076049805, + "step": 501 + }, + { + "epoch": 0.76, + "learning_rate": 7.777851165098011e-07, + "logits/chosen": -1.0005409717559814, + "logits/rejected": -1.0943784713745117, + "logps/chosen": -84.2463607788086, + "logps/rejected": -187.10287475585938, + "loss": 0.1315, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6938743591308594, + "rewards/margins": 8.654955863952637, + "rewards/rejected": -9.348830223083496, + "step": 502 + }, + { + "epoch": 0.76, + "learning_rate": 7.766810407815628e-07, + "logits/chosen": -1.0287072658538818, + "logits/rejected": -1.0100324153900146, + "logps/chosen": -42.83229446411133, + "logps/rejected": -114.5429458618164, + "loss": 0.1766, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.15994618833065033, + "rewards/margins": 5.648642539978027, + "rewards/rejected": -5.488696575164795, + "step": 503 + }, + { + "epoch": 0.77, + "learning_rate": 7.755750171137244e-07, + "logits/chosen": -1.1835378408432007, + "logits/rejected": -1.1462066173553467, + "logps/chosen": -88.63304901123047, + "logps/rejected": -169.634033203125, + "loss": 0.1234, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4358795881271362, + "rewards/margins": 6.794912338256836, + "rewards/rejected": -8.230792045593262, + "step": 504 + }, + { + "epoch": 0.77, + "learning_rate": 7.74467053293113e-07, + "logits/chosen": -0.9608476161956787, + "logits/rejected": -0.9537318348884583, + "logps/chosen": -71.47065734863281, + "logps/rejected": -114.87274169921875, + "loss": 0.1906, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2017298936843872, + "rewards/margins": 4.127964973449707, + "rewards/rejected": -5.329694747924805, + "step": 505 + }, + { + "epoch": 0.77, + "learning_rate": 7.733571571202144e-07, + "logits/chosen": -1.047137975692749, + "logits/rejected": -0.9563291072845459, + "logps/chosen": -78.42756652832031, + "logps/rejected": -176.69107055664062, + "loss": 0.149, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3879927396774292, + "rewards/margins": 7.720418930053711, + "rewards/rejected": -9.10841178894043, + "step": 506 + }, + { + "epoch": 0.77, + "learning_rate": 7.722453364091193e-07, + "logits/chosen": -1.3432402610778809, + "logits/rejected": -1.3647172451019287, + "logps/chosen": -54.534358978271484, + "logps/rejected": -105.29655456542969, + "loss": 0.1106, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.03609006106853485, + "rewards/margins": 4.315974235534668, + "rewards/rejected": -4.35206413269043, + "step": 507 + }, + { + "epoch": 0.77, + "learning_rate": 7.711315989874676e-07, + "logits/chosen": -1.2090696096420288, + "logits/rejected": -1.1107300519943237, + "logps/chosen": -72.83460998535156, + "logps/rejected": -151.87924194335938, + "loss": 0.1057, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6429700255393982, + "rewards/margins": 5.898809432983398, + "rewards/rejected": -6.5417799949646, + "step": 508 + }, + { + "epoch": 0.77, + "learning_rate": 7.700159526963936e-07, + "logits/chosen": -1.324645757675171, + "logits/rejected": -1.3539719581604004, + "logps/chosen": -85.37342834472656, + "logps/rejected": -151.45509338378906, + "loss": 0.1649, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5722832679748535, + "rewards/margins": 5.511928081512451, + "rewards/rejected": -7.0842108726501465, + "step": 509 + }, + { + "epoch": 0.77, + "learning_rate": 7.688984053904713e-07, + "logits/chosen": -1.3109506368637085, + "logits/rejected": -1.2842274904251099, + "logps/chosen": -60.5789680480957, + "logps/rejected": -147.7485809326172, + "loss": 0.1137, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.20602567493915558, + "rewards/margins": 7.15122652053833, + "rewards/rejected": -7.3572516441345215, + "step": 510 + }, + { + "epoch": 0.78, + "learning_rate": 7.677789649376575e-07, + "logits/chosen": -1.1558939218521118, + "logits/rejected": -1.1040030717849731, + "logps/chosen": -62.67195129394531, + "logps/rejected": -129.86436462402344, + "loss": 0.1495, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3074299693107605, + "rewards/margins": 5.887986183166504, + "rewards/rejected": -6.195416450500488, + "step": 511 + }, + { + "epoch": 0.78, + "learning_rate": 7.666576392192388e-07, + "logits/chosen": -1.19840669631958, + "logits/rejected": -1.178692102432251, + "logps/chosen": -58.70737838745117, + "logps/rejected": -129.68954467773438, + "loss": 0.1251, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.15469014644622803, + "rewards/margins": 6.070980072021484, + "rewards/rejected": -6.225669860839844, + "step": 512 + }, + { + "epoch": 0.78, + "learning_rate": 7.655344361297735e-07, + "logits/chosen": -1.0739836692810059, + "logits/rejected": -1.0780495405197144, + "logps/chosen": -64.47042846679688, + "logps/rejected": -133.5172576904297, + "loss": 0.1737, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.27823784947395325, + "rewards/margins": 6.497269630432129, + "rewards/rejected": -6.775506973266602, + "step": 513 + }, + { + "epoch": 0.78, + "learning_rate": 7.644093635770384e-07, + "logits/chosen": -1.2686853408813477, + "logits/rejected": -1.2785826921463013, + "logps/chosen": -60.34698486328125, + "logps/rejected": -119.00567626953125, + "loss": 0.0953, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.033090412616729736, + "rewards/margins": 4.969561576843262, + "rewards/rejected": -4.936470985412598, + "step": 514 + }, + { + "epoch": 0.78, + "learning_rate": 7.632824294819711e-07, + "logits/chosen": -1.1431541442871094, + "logits/rejected": -1.0436018705368042, + "logps/chosen": -60.22837448120117, + "logps/rejected": -147.8499755859375, + "loss": 0.1022, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5924397706985474, + "rewards/margins": 6.656856060028076, + "rewards/rejected": -7.249295711517334, + "step": 515 + }, + { + "epoch": 0.78, + "learning_rate": 7.621536417786158e-07, + "logits/chosen": -1.0664671659469604, + "logits/rejected": -1.052040696144104, + "logps/chosen": -43.781002044677734, + "logps/rejected": -87.1745834350586, + "loss": 0.1714, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9652446508407593, + "rewards/margins": 4.096286773681641, + "rewards/rejected": -3.131042242050171, + "step": 516 + }, + { + "epoch": 0.79, + "learning_rate": 7.610230084140667e-07, + "logits/chosen": -1.2166173458099365, + "logits/rejected": -1.1928117275238037, + "logps/chosen": -68.93216705322266, + "logps/rejected": -122.04391479492188, + "loss": 0.1437, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0721417665481567, + "rewards/margins": 4.350822925567627, + "rewards/rejected": -5.422965049743652, + "step": 517 + }, + { + "epoch": 0.79, + "learning_rate": 7.598905373484119e-07, + "logits/chosen": -1.1036087274551392, + "logits/rejected": -1.0387972593307495, + "logps/chosen": -67.01268768310547, + "logps/rejected": -168.45962524414062, + "loss": 0.1378, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.06719581037759781, + "rewards/margins": 7.95098876953125, + "rewards/rejected": -8.018184661865234, + "step": 518 + }, + { + "epoch": 0.79, + "learning_rate": 7.587562365546776e-07, + "logits/chosen": -1.326158046722412, + "logits/rejected": -1.3189661502838135, + "logps/chosen": -66.62142944335938, + "logps/rejected": -146.9497528076172, + "loss": 0.2035, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.548250675201416, + "rewards/margins": 6.232966899871826, + "rewards/rejected": -6.781217575073242, + "step": 519 + }, + { + "epoch": 0.79, + "learning_rate": 7.576201140187725e-07, + "logits/chosen": -0.9910123944282532, + "logits/rejected": -1.0392197370529175, + "logps/chosen": -69.34978485107422, + "logps/rejected": -113.86421966552734, + "loss": 0.1181, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5841732025146484, + "rewards/margins": 4.638482093811035, + "rewards/rejected": -5.222655296325684, + "step": 520 + }, + { + "epoch": 0.79, + "learning_rate": 7.564821777394306e-07, + "logits/chosen": -1.0627343654632568, + "logits/rejected": -1.0298466682434082, + "logps/chosen": -64.04617309570312, + "logps/rejected": -119.627685546875, + "loss": 0.1754, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.22364619374275208, + "rewards/margins": 4.598440170288086, + "rewards/rejected": -4.822085857391357, + "step": 521 + }, + { + "epoch": 0.79, + "learning_rate": 7.553424357281555e-07, + "logits/chosen": -1.1028233766555786, + "logits/rejected": -1.0683115720748901, + "logps/chosen": -65.89503479003906, + "logps/rejected": -132.91787719726562, + "loss": 0.1563, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.12011255323886871, + "rewards/margins": 6.085408687591553, + "rewards/rejected": -6.205521583557129, + "step": 522 + }, + { + "epoch": 0.79, + "learning_rate": 7.542008960091635e-07, + "logits/chosen": -1.1913988590240479, + "logits/rejected": -1.1794086694717407, + "logps/chosen": -68.01545715332031, + "logps/rejected": -131.48208618164062, + "loss": 0.0884, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9747741222381592, + "rewards/margins": 5.525202751159668, + "rewards/rejected": -6.499977111816406, + "step": 523 + }, + { + "epoch": 0.8, + "learning_rate": 7.530575666193282e-07, + "logits/chosen": -1.0506024360656738, + "logits/rejected": -1.0830491781234741, + "logps/chosen": -70.56978607177734, + "logps/rejected": -100.9091796875, + "loss": 0.1489, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2066999673843384, + "rewards/margins": 2.789134979248047, + "rewards/rejected": -3.9958345890045166, + "step": 524 + }, + { + "epoch": 0.8, + "learning_rate": 7.519124556081222e-07, + "logits/chosen": -1.0187995433807373, + "logits/rejected": -1.0285779237747192, + "logps/chosen": -65.80451202392578, + "logps/rejected": -122.89828491210938, + "loss": 0.2987, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.276435375213623, + "rewards/margins": 4.4974870681762695, + "rewards/rejected": -5.773921966552734, + "step": 525 + }, + { + "epoch": 0.8, + "learning_rate": 7.507655710375621e-07, + "logits/chosen": -1.194640874862671, + "logits/rejected": -1.1428656578063965, + "logps/chosen": -94.79371643066406, + "logps/rejected": -182.0157470703125, + "loss": 0.2202, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2120563983917236, + "rewards/margins": 7.609585762023926, + "rewards/rejected": -8.82164192199707, + "step": 526 + }, + { + "epoch": 0.8, + "learning_rate": 7.49616920982151e-07, + "logits/chosen": -1.3102431297302246, + "logits/rejected": -1.3399966955184937, + "logps/chosen": -63.27342224121094, + "logps/rejected": -119.10459899902344, + "loss": 0.2103, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.139730766415596, + "rewards/margins": 5.59067964553833, + "rewards/rejected": -5.730410575866699, + "step": 527 + }, + { + "epoch": 0.8, + "learning_rate": 7.484665135288213e-07, + "logits/chosen": -1.1653929948806763, + "logits/rejected": -1.1708984375, + "logps/chosen": -59.638404846191406, + "logps/rejected": -116.32239532470703, + "loss": 0.161, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.47278904914855957, + "rewards/margins": 4.590404987335205, + "rewards/rejected": -5.063194274902344, + "step": 528 + }, + { + "epoch": 0.8, + "learning_rate": 7.473143567768785e-07, + "logits/chosen": -1.0732041597366333, + "logits/rejected": -1.0529849529266357, + "logps/chosen": -59.536346435546875, + "logps/rejected": -114.80162048339844, + "loss": 0.2172, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.2741459906101227, + "rewards/margins": 4.777864933013916, + "rewards/rejected": -5.052010536193848, + "step": 529 + }, + { + "epoch": 0.81, + "learning_rate": 7.461604588379435e-07, + "logits/chosen": -0.9924188852310181, + "logits/rejected": -1.008417010307312, + "logps/chosen": -64.02005004882812, + "logps/rejected": -123.69132995605469, + "loss": 0.0818, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6893237829208374, + "rewards/margins": 4.819268703460693, + "rewards/rejected": -6.50859260559082, + "step": 530 + }, + { + "epoch": 0.81, + "learning_rate": 7.450048278358961e-07, + "logits/chosen": -1.071373462677002, + "logits/rejected": -1.0413519144058228, + "logps/chosen": -56.579383850097656, + "logps/rejected": -115.09782409667969, + "loss": 0.198, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5916543006896973, + "rewards/margins": 5.022871017456055, + "rewards/rejected": -5.61452579498291, + "step": 531 + }, + { + "epoch": 0.81, + "learning_rate": 7.438474719068173e-07, + "logits/chosen": -1.0422636270523071, + "logits/rejected": -0.933167576789856, + "logps/chosen": -87.04025268554688, + "logps/rejected": -203.26632690429688, + "loss": 0.1469, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.167875051498413, + "rewards/margins": 8.060665130615234, + "rewards/rejected": -9.22853946685791, + "step": 532 + }, + { + "epoch": 0.81, + "learning_rate": 7.426883991989324e-07, + "logits/chosen": -1.1454071998596191, + "logits/rejected": -1.075371265411377, + "logps/chosen": -72.74165344238281, + "logps/rejected": -129.0819854736328, + "loss": 0.0906, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2687061131000519, + "rewards/margins": 5.253970623016357, + "rewards/rejected": -5.522676467895508, + "step": 533 + }, + { + "epoch": 0.81, + "learning_rate": 7.415276178725537e-07, + "logits/chosen": -1.0539616346359253, + "logits/rejected": -1.0729026794433594, + "logps/chosen": -46.86689376831055, + "logps/rejected": -89.4149398803711, + "loss": 0.1801, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.18019194900989532, + "rewards/margins": 3.8524868488311768, + "rewards/rejected": -4.032678604125977, + "step": 534 + }, + { + "epoch": 0.81, + "learning_rate": 7.403651361000223e-07, + "logits/chosen": -1.3698267936706543, + "logits/rejected": -1.2763550281524658, + "logps/chosen": -56.74170684814453, + "logps/rejected": -132.7080535888672, + "loss": 0.1605, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.5413241386413574, + "rewards/margins": 5.412938117980957, + "rewards/rejected": -5.954262733459473, + "step": 535 + }, + { + "epoch": 0.81, + "learning_rate": 7.392009620656511e-07, + "logits/chosen": -1.1172040700912476, + "logits/rejected": -1.0188947916030884, + "logps/chosen": -50.54315948486328, + "logps/rejected": -130.28665161132812, + "loss": 0.0991, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.25699636340141296, + "rewards/margins": 6.52001953125, + "rewards/rejected": -6.2630228996276855, + "step": 536 + }, + { + "epoch": 0.82, + "learning_rate": 7.38035103965668e-07, + "logits/chosen": -1.1840345859527588, + "logits/rejected": -1.196291446685791, + "logps/chosen": -39.160369873046875, + "logps/rejected": -95.95783996582031, + "loss": 0.1399, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5449946522712708, + "rewards/margins": 5.224076271057129, + "rewards/rejected": -4.679081916809082, + "step": 537 + }, + { + "epoch": 0.82, + "learning_rate": 7.368675700081564e-07, + "logits/chosen": -1.2831922769546509, + "logits/rejected": -1.2046195268630981, + "logps/chosen": -77.98712921142578, + "logps/rejected": -171.9889678955078, + "loss": 0.0773, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9342814683914185, + "rewards/margins": 7.7829389572143555, + "rewards/rejected": -8.717220306396484, + "step": 538 + }, + { + "epoch": 0.82, + "learning_rate": 7.356983684129989e-07, + "logits/chosen": -1.4122549295425415, + "logits/rejected": -1.3278307914733887, + "logps/chosen": -60.05818176269531, + "logps/rejected": -172.47239685058594, + "loss": 0.0933, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.32834285497665405, + "rewards/margins": 8.709205627441406, + "rewards/rejected": -9.037548065185547, + "step": 539 + }, + { + "epoch": 0.82, + "learning_rate": 7.345275074118185e-07, + "logits/chosen": -0.9864742755889893, + "logits/rejected": -1.0224814414978027, + "logps/chosen": -55.72515106201172, + "logps/rejected": -108.23019409179688, + "loss": 0.205, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.12911073863506317, + "rewards/margins": 4.800266265869141, + "rewards/rejected": -4.92937707901001, + "step": 540 + }, + { + "epoch": 0.82, + "learning_rate": 7.333549952479214e-07, + "logits/chosen": -1.0197805166244507, + "logits/rejected": -0.9988608360290527, + "logps/chosen": -77.59547424316406, + "logps/rejected": -171.5909423828125, + "loss": 0.1427, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.20403766632080078, + "rewards/margins": 7.874151229858398, + "rewards/rejected": -8.078189849853516, + "step": 541 + }, + { + "epoch": 0.82, + "learning_rate": 7.321808401762389e-07, + "logits/chosen": -1.1090089082717896, + "logits/rejected": -1.0911915302276611, + "logps/chosen": -67.35092163085938, + "logps/rejected": -138.5771942138672, + "loss": 0.1552, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2441098392009735, + "rewards/margins": 6.130369186401367, + "rewards/rejected": -6.374478816986084, + "step": 542 + }, + { + "epoch": 0.82, + "learning_rate": 7.310050504632679e-07, + "logits/chosen": -1.143861174583435, + "logits/rejected": -1.1029807329177856, + "logps/chosen": -77.17582702636719, + "logps/rejected": -171.35406494140625, + "loss": 0.0631, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.21549063920974731, + "rewards/margins": 8.488594055175781, + "rewards/rejected": -8.704084396362305, + "step": 543 + }, + { + "epoch": 0.83, + "learning_rate": 7.298276343870151e-07, + "logits/chosen": -1.391092300415039, + "logits/rejected": -1.4528110027313232, + "logps/chosen": -78.30352020263672, + "logps/rejected": -143.43536376953125, + "loss": 0.141, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.030500110238790512, + "rewards/margins": 6.488924980163574, + "rewards/rejected": -6.4584245681762695, + "step": 544 + }, + { + "epoch": 0.83, + "learning_rate": 7.286486002369365e-07, + "logits/chosen": -1.1510714292526245, + "logits/rejected": -1.0716849565505981, + "logps/chosen": -66.09734344482422, + "logps/rejected": -164.27682495117188, + "loss": 0.0946, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6337064504623413, + "rewards/margins": 8.143341064453125, + "rewards/rejected": -8.777047157287598, + "step": 545 + }, + { + "epoch": 0.83, + "learning_rate": 7.274679563138804e-07, + "logits/chosen": -1.1280611753463745, + "logits/rejected": -1.1545814275741577, + "logps/chosen": -53.178977966308594, + "logps/rejected": -125.91136932373047, + "loss": 0.1733, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.28666895627975464, + "rewards/margins": 5.748191833496094, + "rewards/rejected": -6.034860134124756, + "step": 546 + }, + { + "epoch": 0.83, + "learning_rate": 7.262857109300282e-07, + "logits/chosen": -1.1145057678222656, + "logits/rejected": -1.0593397617340088, + "logps/chosen": -86.71113586425781, + "logps/rejected": -195.0640869140625, + "loss": 0.1717, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8704631924629211, + "rewards/margins": 8.772705078125, + "rewards/rejected": -9.643167495727539, + "step": 547 + }, + { + "epoch": 0.83, + "learning_rate": 7.251018724088366e-07, + "logits/chosen": -1.4092875719070435, + "logits/rejected": -1.3813461065292358, + "logps/chosen": -50.07902526855469, + "logps/rejected": -97.85746765136719, + "loss": 0.0668, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.31925973296165466, + "rewards/margins": 4.2447614669799805, + "rewards/rejected": -4.564021110534668, + "step": 548 + }, + { + "epoch": 0.83, + "learning_rate": 7.239164490849783e-07, + "logits/chosen": -1.1854861974716187, + "logits/rejected": -1.1800771951675415, + "logps/chosen": -80.69862365722656, + "logps/rejected": -177.3069305419922, + "loss": 0.1178, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1843868494033813, + "rewards/margins": 7.85844087600708, + "rewards/rejected": -9.042828559875488, + "step": 549 + }, + { + "epoch": 0.84, + "learning_rate": 7.227294493042837e-07, + "logits/chosen": -1.6006877422332764, + "logits/rejected": -1.5721409320831299, + "logps/chosen": -53.85042190551758, + "logps/rejected": -107.69651794433594, + "loss": 0.1282, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.15171164274215698, + "rewards/margins": 4.337007522583008, + "rewards/rejected": -4.185296535491943, + "step": 550 + }, + { + "epoch": 0.84, + "learning_rate": 7.215408814236818e-07, + "logits/chosen": -1.2671542167663574, + "logits/rejected": -1.2413151264190674, + "logps/chosen": -75.32283020019531, + "logps/rejected": -147.53758239746094, + "loss": 0.0976, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.085512638092041, + "rewards/margins": 6.20933723449707, + "rewards/rejected": -7.294849395751953, + "step": 551 + }, + { + "epoch": 0.84, + "learning_rate": 7.203507538111421e-07, + "logits/chosen": -1.025465488433838, + "logits/rejected": -1.0870991945266724, + "logps/chosen": -59.177955627441406, + "logps/rejected": -120.17818450927734, + "loss": 0.1112, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.023819267749786377, + "rewards/margins": 5.563221454620361, + "rewards/rejected": -5.539402008056641, + "step": 552 + }, + { + "epoch": 0.84, + "learning_rate": 7.19159074845615e-07, + "logits/chosen": -1.0638189315795898, + "logits/rejected": -1.1864960193634033, + "logps/chosen": -90.62945556640625, + "logps/rejected": -163.56187438964844, + "loss": 0.1703, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.929221510887146, + "rewards/margins": 6.858667373657227, + "rewards/rejected": -8.787888526916504, + "step": 553 + }, + { + "epoch": 0.84, + "learning_rate": 7.179658529169727e-07, + "logits/chosen": -1.1522774696350098, + "logits/rejected": -1.1589596271514893, + "logps/chosen": -48.853660583496094, + "logps/rejected": -120.89286804199219, + "loss": 0.1392, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2509755492210388, + "rewards/margins": 6.310558795928955, + "rewards/rejected": -6.05958366394043, + "step": 554 + }, + { + "epoch": 0.84, + "learning_rate": 7.16771096425951e-07, + "logits/chosen": -1.0318248271942139, + "logits/rejected": -1.0054022073745728, + "logps/chosen": -67.14990997314453, + "logps/rejected": -145.93971252441406, + "loss": 0.1306, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8901978731155396, + "rewards/margins": 6.804965972900391, + "rewards/rejected": -7.695164203643799, + "step": 555 + }, + { + "epoch": 0.84, + "learning_rate": 7.155748137840892e-07, + "logits/chosen": -0.9326344728469849, + "logits/rejected": -0.9701443314552307, + "logps/chosen": -74.25255584716797, + "logps/rejected": -122.07107543945312, + "loss": 0.1298, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6644315719604492, + "rewards/margins": 4.801297187805176, + "rewards/rejected": -5.465729236602783, + "step": 556 + }, + { + "epoch": 0.85, + "learning_rate": 7.143770134136713e-07, + "logits/chosen": -1.305212140083313, + "logits/rejected": -1.285457968711853, + "logps/chosen": -80.28656768798828, + "logps/rejected": -155.32589721679688, + "loss": 0.14, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7364399433135986, + "rewards/margins": 5.878342628479004, + "rewards/rejected": -7.614782333374023, + "step": 557 + }, + { + "epoch": 0.85, + "learning_rate": 7.131777037476668e-07, + "logits/chosen": -1.2526302337646484, + "logits/rejected": -1.2162461280822754, + "logps/chosen": -74.2842788696289, + "logps/rejected": -152.82522583007812, + "loss": 0.0943, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8585233688354492, + "rewards/margins": 6.765040874481201, + "rewards/rejected": -7.62356424331665, + "step": 558 + }, + { + "epoch": 0.85, + "learning_rate": 7.119768932296715e-07, + "logits/chosen": -1.1978999376296997, + "logits/rejected": -1.1723815202713013, + "logps/chosen": -50.21682357788086, + "logps/rejected": -119.23377990722656, + "loss": 0.1388, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.23364275693893433, + "rewards/margins": 5.576208591461182, + "rewards/rejected": -5.342565059661865, + "step": 559 + }, + { + "epoch": 0.85, + "learning_rate": 7.107745903138471e-07, + "logits/chosen": -1.3260363340377808, + "logits/rejected": -1.3038979768753052, + "logps/chosen": -54.60324478149414, + "logps/rejected": -108.67857360839844, + "loss": 0.131, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.049276888370513916, + "rewards/margins": 5.071126461029053, + "rewards/rejected": -5.120403289794922, + "step": 560 + }, + { + "epoch": 0.85, + "learning_rate": 7.095708034648629e-07, + "logits/chosen": -1.2655694484710693, + "logits/rejected": -1.4480382204055786, + "logps/chosen": -89.42362976074219, + "logps/rejected": -167.7763671875, + "loss": 0.0957, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0199987888336182, + "rewards/margins": 7.234724521636963, + "rewards/rejected": -8.25472354888916, + "step": 561 + }, + { + "epoch": 0.85, + "learning_rate": 7.083655411578355e-07, + "logits/chosen": -1.3500678539276123, + "logits/rejected": -1.3585700988769531, + "logps/chosen": -48.68550109863281, + "logps/rejected": -109.5958480834961, + "loss": 0.1312, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.08222457766532898, + "rewards/margins": 5.2090911865234375, + "rewards/rejected": -5.126866817474365, + "step": 562 + }, + { + "epoch": 0.86, + "learning_rate": 7.071588118782692e-07, + "logits/chosen": -1.164905309677124, + "logits/rejected": -1.1342130899429321, + "logps/chosen": -64.54033660888672, + "logps/rejected": -140.8212127685547, + "loss": 0.1478, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.039550647139549255, + "rewards/margins": 6.7797017097473145, + "rewards/rejected": -6.819252014160156, + "step": 563 + }, + { + "epoch": 0.86, + "learning_rate": 7.059506241219964e-07, + "logits/chosen": -1.1916640996932983, + "logits/rejected": -1.2581453323364258, + "logps/chosen": -68.16947937011719, + "logps/rejected": -125.39520263671875, + "loss": 0.164, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5781932473182678, + "rewards/margins": 5.795152187347412, + "rewards/rejected": -6.373345375061035, + "step": 564 + }, + { + "epoch": 0.86, + "learning_rate": 7.047409863951176e-07, + "logits/chosen": -1.318346381187439, + "logits/rejected": -1.313532829284668, + "logps/chosen": -65.78179931640625, + "logps/rejected": -132.06607055664062, + "loss": 0.1061, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.21447968482971191, + "rewards/margins": 6.131742477416992, + "rewards/rejected": -6.346221923828125, + "step": 565 + }, + { + "epoch": 0.86, + "learning_rate": 7.035299072139419e-07, + "logits/chosen": -1.2092915773391724, + "logits/rejected": -1.2645493745803833, + "logps/chosen": -60.975196838378906, + "logps/rejected": -91.39009094238281, + "loss": 0.1434, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.0362534523010254, + "rewards/margins": 3.428194999694824, + "rewards/rejected": -4.46444845199585, + "step": 566 + }, + { + "epoch": 0.86, + "learning_rate": 7.023173951049267e-07, + "logits/chosen": -1.1492639780044556, + "logits/rejected": -1.1428110599517822, + "logps/chosen": -89.39848327636719, + "logps/rejected": -193.7574462890625, + "loss": 0.1345, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.329371690750122, + "rewards/margins": 8.847994804382324, + "rewards/rejected": -10.177366256713867, + "step": 567 + }, + { + "epoch": 0.86, + "learning_rate": 7.011034586046176e-07, + "logits/chosen": -1.1973826885223389, + "logits/rejected": -1.2077950239181519, + "logps/chosen": -59.40647888183594, + "logps/rejected": -117.23844909667969, + "loss": 0.1824, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.42822229862213135, + "rewards/margins": 5.149479866027832, + "rewards/rejected": -5.577702045440674, + "step": 568 + }, + { + "epoch": 0.86, + "learning_rate": 6.998881062595886e-07, + "logits/chosen": -1.154420018196106, + "logits/rejected": -1.1807911396026611, + "logps/chosen": -57.72307586669922, + "logps/rejected": -118.18482208251953, + "loss": 0.117, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6299920082092285, + "rewards/margins": 5.129215240478516, + "rewards/rejected": -5.759207248687744, + "step": 569 + }, + { + "epoch": 0.87, + "learning_rate": 6.986713466263817e-07, + "logits/chosen": -1.1156989336013794, + "logits/rejected": -1.0888198614120483, + "logps/chosen": -66.96090698242188, + "logps/rejected": -145.86105346679688, + "loss": 0.1571, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1332242488861084, + "rewards/margins": 6.20496940612793, + "rewards/rejected": -7.338193416595459, + "step": 570 + }, + { + "epoch": 0.87, + "learning_rate": 6.974531882714471e-07, + "logits/chosen": -1.1809208393096924, + "logits/rejected": -1.129634141921997, + "logps/chosen": -64.9968032836914, + "logps/rejected": -133.65103149414062, + "loss": 0.1651, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7019888758659363, + "rewards/margins": 5.911318302154541, + "rewards/rejected": -6.613307476043701, + "step": 571 + }, + { + "epoch": 0.87, + "learning_rate": 6.962336397710819e-07, + "logits/chosen": -1.1546497344970703, + "logits/rejected": -1.1487202644348145, + "logps/chosen": -78.28816223144531, + "logps/rejected": -186.17295837402344, + "loss": 0.1536, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.483786940574646, + "rewards/margins": 8.131475448608398, + "rewards/rejected": -9.615262031555176, + "step": 572 + }, + { + "epoch": 0.87, + "learning_rate": 6.950127097113707e-07, + "logits/chosen": -0.9886271357536316, + "logits/rejected": -1.040745735168457, + "logps/chosen": -66.25126647949219, + "logps/rejected": -147.1013946533203, + "loss": 0.1091, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.27855655550956726, + "rewards/margins": 7.534342288970947, + "rewards/rejected": -7.812899112701416, + "step": 573 + }, + { + "epoch": 0.87, + "learning_rate": 6.93790406688125e-07, + "logits/chosen": -1.3215430974960327, + "logits/rejected": -1.2974023818969727, + "logps/chosen": -53.79179000854492, + "logps/rejected": -101.81126403808594, + "loss": 0.1484, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5321861505508423, + "rewards/margins": 3.803168296813965, + "rewards/rejected": -4.335354328155518, + "step": 574 + }, + { + "epoch": 0.87, + "learning_rate": 6.92566739306822e-07, + "logits/chosen": -1.151102900505066, + "logits/rejected": -1.27048921585083, + "logps/chosen": -72.40945434570312, + "logps/rejected": -127.10850524902344, + "loss": 0.1173, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7070590257644653, + "rewards/margins": 4.9336981773376465, + "rewards/rejected": -6.640757083892822, + "step": 575 + }, + { + "epoch": 0.88, + "learning_rate": 6.913417161825449e-07, + "logits/chosen": -1.1125006675720215, + "logits/rejected": -1.107159972190857, + "logps/chosen": -80.56257629394531, + "logps/rejected": -132.2965087890625, + "loss": 0.1024, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.789968729019165, + "rewards/margins": 5.229074001312256, + "rewards/rejected": -7.019042491912842, + "step": 576 + }, + { + "epoch": 0.88, + "learning_rate": 6.901153459399217e-07, + "logits/chosen": -1.2071489095687866, + "logits/rejected": -1.1959619522094727, + "logps/chosen": -76.89067840576172, + "logps/rejected": -159.0953826904297, + "loss": 0.1308, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8559060096740723, + "rewards/margins": 6.734124660491943, + "rewards/rejected": -8.590030670166016, + "step": 577 + }, + { + "epoch": 0.88, + "learning_rate": 6.888876372130646e-07, + "logits/chosen": -1.2549490928649902, + "logits/rejected": -1.1717318296432495, + "logps/chosen": -84.55844116210938, + "logps/rejected": -171.13616943359375, + "loss": 0.1948, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.0151565074920654, + "rewards/margins": 6.452093601226807, + "rewards/rejected": -8.467248916625977, + "step": 578 + }, + { + "epoch": 0.88, + "learning_rate": 6.876585986455095e-07, + "logits/chosen": -1.0063295364379883, + "logits/rejected": -1.043433427810669, + "logps/chosen": -61.25851058959961, + "logps/rejected": -123.28639221191406, + "loss": 0.1691, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2367548942565918, + "rewards/margins": 5.889279842376709, + "rewards/rejected": -7.126034736633301, + "step": 579 + }, + { + "epoch": 0.88, + "learning_rate": 6.864282388901543e-07, + "logits/chosen": -1.2562803030014038, + "logits/rejected": -1.2214447259902954, + "logps/chosen": -93.09823608398438, + "logps/rejected": -180.29898071289062, + "loss": 0.1201, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.6010587215423584, + "rewards/margins": 6.928165912628174, + "rewards/rejected": -9.52922534942627, + "step": 580 + }, + { + "epoch": 0.88, + "learning_rate": 6.851965666091992e-07, + "logits/chosen": -1.1490026712417603, + "logits/rejected": -1.1077204942703247, + "logps/chosen": -71.9451904296875, + "logps/rejected": -160.8985595703125, + "loss": 0.13, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9476263523101807, + "rewards/margins": 6.7807416915893555, + "rewards/rejected": -8.72836971282959, + "step": 581 + }, + { + "epoch": 0.88, + "learning_rate": 6.839635904740845e-07, + "logits/chosen": -1.1142712831497192, + "logits/rejected": -1.0776047706604004, + "logps/chosen": -75.88325500488281, + "logps/rejected": -142.70265197753906, + "loss": 0.0962, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5644887685775757, + "rewards/margins": 5.492844581604004, + "rewards/rejected": -7.057333469390869, + "step": 582 + }, + { + "epoch": 0.89, + "learning_rate": 6.827293191654308e-07, + "logits/chosen": -1.191672921180725, + "logits/rejected": -1.1895473003387451, + "logps/chosen": -63.94920349121094, + "logps/rejected": -106.04363250732422, + "loss": 0.1393, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.49468994140625, + "rewards/margins": 3.7244133949279785, + "rewards/rejected": -5.21910285949707, + "step": 583 + }, + { + "epoch": 0.89, + "learning_rate": 6.814937613729765e-07, + "logits/chosen": -1.0876585245132446, + "logits/rejected": -1.0104507207870483, + "logps/chosen": -69.12646484375, + "logps/rejected": -184.89944458007812, + "loss": 0.128, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6190570592880249, + "rewards/margins": 8.769519805908203, + "rewards/rejected": -9.388575553894043, + "step": 584 + }, + { + "epoch": 0.89, + "learning_rate": 6.80256925795518e-07, + "logits/chosen": -1.4268258810043335, + "logits/rejected": -1.5291452407836914, + "logps/chosen": -67.29776763916016, + "logps/rejected": -103.68289947509766, + "loss": 0.1222, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23457276821136475, + "rewards/margins": 4.427850723266602, + "rewards/rejected": -4.662423610687256, + "step": 585 + }, + { + "epoch": 0.89, + "learning_rate": 6.790188211408471e-07, + "logits/chosen": -1.3198050260543823, + "logits/rejected": -1.3658562898635864, + "logps/chosen": -61.914459228515625, + "logps/rejected": -129.01361083984375, + "loss": 0.1319, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0598723888397217, + "rewards/margins": 6.038536071777344, + "rewards/rejected": -7.0984086990356445, + "step": 586 + }, + { + "epoch": 0.89, + "learning_rate": 6.777794561256913e-07, + "logits/chosen": -1.0327891111373901, + "logits/rejected": -1.0176843404769897, + "logps/chosen": -104.78642272949219, + "logps/rejected": -210.1189422607422, + "loss": 0.1325, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.38815975189209, + "rewards/margins": 9.216400146484375, + "rewards/rejected": -11.604558944702148, + "step": 587 + }, + { + "epoch": 0.89, + "learning_rate": 6.765388394756504e-07, + "logits/chosen": -1.4385006427764893, + "logits/rejected": -1.4017232656478882, + "logps/chosen": -69.59347534179688, + "logps/rejected": -148.25234985351562, + "loss": 0.1039, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3803633451461792, + "rewards/margins": 6.397822380065918, + "rewards/rejected": -6.7781853675842285, + "step": 588 + }, + { + "epoch": 0.89, + "learning_rate": 6.75296979925137e-07, + "logits/chosen": -1.3260376453399658, + "logits/rejected": -1.3459455966949463, + "logps/chosen": -75.15542602539062, + "logps/rejected": -171.506103515625, + "loss": 0.1724, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2072641849517822, + "rewards/margins": 7.993558883666992, + "rewards/rejected": -9.200822830200195, + "step": 589 + }, + { + "epoch": 0.9, + "learning_rate": 6.740538862173139e-07, + "logits/chosen": -0.9990830421447754, + "logits/rejected": -0.9321877956390381, + "logps/chosen": -80.32003021240234, + "logps/rejected": -161.963134765625, + "loss": 0.0994, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.020470142364502, + "rewards/margins": 6.135712623596191, + "rewards/rejected": -8.156183242797852, + "step": 590 + }, + { + "epoch": 0.9, + "learning_rate": 6.728095671040329e-07, + "logits/chosen": -1.2907447814941406, + "logits/rejected": -1.4420808553695679, + "logps/chosen": -89.08474731445312, + "logps/rejected": -205.51153564453125, + "loss": 0.1139, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2016139030456543, + "rewards/margins": 10.155261993408203, + "rewards/rejected": -11.356876373291016, + "step": 591 + }, + { + "epoch": 0.9, + "learning_rate": 6.715640313457732e-07, + "logits/chosen": -1.2335330247879028, + "logits/rejected": -1.212240219116211, + "logps/chosen": -75.88554382324219, + "logps/rejected": -139.1956787109375, + "loss": 0.1957, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.228327751159668, + "rewards/margins": 5.448575496673584, + "rewards/rejected": -7.676904201507568, + "step": 592 + }, + { + "epoch": 0.9, + "learning_rate": 6.703172877115793e-07, + "logits/chosen": -1.1978495121002197, + "logits/rejected": -1.222267746925354, + "logps/chosen": -78.69413757324219, + "logps/rejected": -159.76504516601562, + "loss": 0.2183, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.261566162109375, + "rewards/margins": 7.104775428771973, + "rewards/rejected": -8.366341590881348, + "step": 593 + }, + { + "epoch": 0.9, + "learning_rate": 6.690693449790001e-07, + "logits/chosen": -0.9641166925430298, + "logits/rejected": -0.947120189666748, + "logps/chosen": -66.4343032836914, + "logps/rejected": -141.80267333984375, + "loss": 0.0958, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8348309397697449, + "rewards/margins": 6.30582332611084, + "rewards/rejected": -7.140653610229492, + "step": 594 + }, + { + "epoch": 0.9, + "learning_rate": 6.678202119340262e-07, + "logits/chosen": -1.1618077754974365, + "logits/rejected": -1.1629685163497925, + "logps/chosen": -53.093605041503906, + "logps/rejected": -144.32376098632812, + "loss": 0.129, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.05835476517677307, + "rewards/margins": 7.923296928405762, + "rewards/rejected": -7.8649420738220215, + "step": 595 + }, + { + "epoch": 0.91, + "learning_rate": 6.665698973710288e-07, + "logits/chosen": -1.1058599948883057, + "logits/rejected": -1.0854295492172241, + "logps/chosen": -49.76495361328125, + "logps/rejected": -157.39927673339844, + "loss": 0.0839, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4173562824726105, + "rewards/margins": 9.607122421264648, + "rewards/rejected": -9.189765930175781, + "step": 596 + }, + { + "epoch": 0.91, + "learning_rate": 6.653184100926969e-07, + "logits/chosen": -1.0652377605438232, + "logits/rejected": -1.038432240486145, + "logps/chosen": -65.50021362304688, + "logps/rejected": -153.48757934570312, + "loss": 0.1159, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08607436716556549, + "rewards/margins": 6.7374043464660645, + "rewards/rejected": -6.823478698730469, + "step": 597 + }, + { + "epoch": 0.91, + "learning_rate": 6.640657589099767e-07, + "logits/chosen": -1.2282792329788208, + "logits/rejected": -1.1877415180206299, + "logps/chosen": -64.80270385742188, + "logps/rejected": -132.14723205566406, + "loss": 0.0669, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5253083109855652, + "rewards/margins": 5.846732139587402, + "rewards/rejected": -6.372040271759033, + "step": 598 + }, + { + "epoch": 0.91, + "learning_rate": 6.628119526420078e-07, + "logits/chosen": -1.1812118291854858, + "logits/rejected": -1.1249544620513916, + "logps/chosen": -69.10399627685547, + "logps/rejected": -144.18438720703125, + "loss": 0.1138, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5147157907485962, + "rewards/margins": 6.449079990386963, + "rewards/rejected": -6.963795185089111, + "step": 599 + }, + { + "epoch": 0.91, + "learning_rate": 6.615570001160625e-07, + "logits/chosen": -1.1095051765441895, + "logits/rejected": -1.180436611175537, + "logps/chosen": -53.987178802490234, + "logps/rejected": -113.84768676757812, + "loss": 0.1409, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.4846702218055725, + "rewards/margins": 5.244781970977783, + "rewards/rejected": -5.729451656341553, + "step": 600 + }, + { + "epoch": 0.91, + "learning_rate": 6.603009101674835e-07, + "logits/chosen": -1.1934107542037964, + "logits/rejected": -1.3291200399398804, + "logps/chosen": -63.650306701660156, + "logps/rejected": -107.93333435058594, + "loss": 0.0963, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12526728212833405, + "rewards/margins": 4.8406572341918945, + "rewards/rejected": -4.9659247398376465, + "step": 601 + }, + { + "epoch": 0.91, + "learning_rate": 6.590436916396207e-07, + "logits/chosen": -1.046741008758545, + "logits/rejected": -1.0239633321762085, + "logps/chosen": -84.33003234863281, + "logps/rejected": -186.45541381835938, + "loss": 0.1105, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0560230016708374, + "rewards/margins": 8.320999145507812, + "rewards/rejected": -9.377021789550781, + "step": 602 + }, + { + "epoch": 0.92, + "learning_rate": 6.577853533837703e-07, + "logits/chosen": -1.112562656402588, + "logits/rejected": -1.0928502082824707, + "logps/chosen": -33.96232223510742, + "logps/rejected": -91.48005676269531, + "loss": 0.108, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.8726045489311218, + "rewards/margins": 5.010224342346191, + "rewards/rejected": -4.137619972229004, + "step": 603 + }, + { + "epoch": 0.92, + "learning_rate": 6.565259042591111e-07, + "logits/chosen": -1.0827683210372925, + "logits/rejected": -1.0390585660934448, + "logps/chosen": -45.63076400756836, + "logps/rejected": -153.36795043945312, + "loss": 0.0915, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4685389995574951, + "rewards/margins": 7.915752410888672, + "rewards/rejected": -7.447213649749756, + "step": 604 + }, + { + "epoch": 0.92, + "learning_rate": 6.552653531326436e-07, + "logits/chosen": -1.059299349784851, + "logits/rejected": -1.070773959159851, + "logps/chosen": -40.399654388427734, + "logps/rejected": -85.68155670166016, + "loss": 0.0989, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.15366578102111816, + "rewards/margins": 3.8557958602905273, + "rewards/rejected": -3.70212984085083, + "step": 605 + }, + { + "epoch": 0.92, + "learning_rate": 6.540037088791263e-07, + "logits/chosen": -1.1014400720596313, + "logits/rejected": -1.147748589515686, + "logps/chosen": -55.5066032409668, + "logps/rejected": -123.78807067871094, + "loss": 0.0967, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.33187153935432434, + "rewards/margins": 6.414957523345947, + "rewards/rejected": -6.083086967468262, + "step": 606 + }, + { + "epoch": 0.92, + "learning_rate": 6.527409803810136e-07, + "logits/chosen": -1.2319551706314087, + "logits/rejected": -1.2486858367919922, + "logps/chosen": -89.27584838867188, + "logps/rejected": -172.9617919921875, + "loss": 0.1595, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6647179126739502, + "rewards/margins": 6.800714015960693, + "rewards/rejected": -8.465432167053223, + "step": 607 + }, + { + "epoch": 0.92, + "learning_rate": 6.514771765283942e-07, + "logits/chosen": -1.3046828508377075, + "logits/rejected": -1.3457309007644653, + "logps/chosen": -59.46671676635742, + "logps/rejected": -164.77854919433594, + "loss": 0.1361, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5135937929153442, + "rewards/margins": 8.90949821472168, + "rewards/rejected": -8.395904541015625, + "step": 608 + }, + { + "epoch": 0.93, + "learning_rate": 6.502123062189268e-07, + "logits/chosen": -1.025585412979126, + "logits/rejected": -0.9563760161399841, + "logps/chosen": -65.9186019897461, + "logps/rejected": -135.0490264892578, + "loss": 0.1325, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.41695117950439453, + "rewards/margins": 6.163644790649414, + "rewards/rejected": -6.580595970153809, + "step": 609 + }, + { + "epoch": 0.93, + "learning_rate": 6.489463783577786e-07, + "logits/chosen": -1.0755833387374878, + "logits/rejected": -1.0082464218139648, + "logps/chosen": -54.03449630737305, + "logps/rejected": -155.23272705078125, + "loss": 0.2115, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.22527417540550232, + "rewards/margins": 7.68045711517334, + "rewards/rejected": -7.4551825523376465, + "step": 610 + }, + { + "epoch": 0.93, + "learning_rate": 6.476794018575629e-07, + "logits/chosen": -0.8818612694740295, + "logits/rejected": -0.8964678645133972, + "logps/chosen": -69.8212890625, + "logps/rejected": -94.2854995727539, + "loss": 0.1577, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6802802085876465, + "rewards/margins": 3.0808987617492676, + "rewards/rejected": -4.761178970336914, + "step": 611 + }, + { + "epoch": 0.93, + "learning_rate": 6.464113856382751e-07, + "logits/chosen": -1.3570644855499268, + "logits/rejected": -1.4353338479995728, + "logps/chosen": -44.93012619018555, + "logps/rejected": -113.91537475585938, + "loss": 0.123, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6995503902435303, + "rewards/margins": 6.318906784057617, + "rewards/rejected": -5.619356155395508, + "step": 612 + }, + { + "epoch": 0.93, + "learning_rate": 6.451423386272311e-07, + "logits/chosen": -1.081716537475586, + "logits/rejected": -1.080958366394043, + "logps/chosen": -66.42369842529297, + "logps/rejected": -146.11070251464844, + "loss": 0.1011, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.007286667823791504, + "rewards/margins": 7.636693000793457, + "rewards/rejected": -7.629406929016113, + "step": 613 + }, + { + "epoch": 0.93, + "learning_rate": 6.438722697590038e-07, + "logits/chosen": -1.254135012626648, + "logits/rejected": -1.28207266330719, + "logps/chosen": -56.42732238769531, + "logps/rejected": -133.7398223876953, + "loss": 0.1432, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.33388689160346985, + "rewards/margins": 6.535743236541748, + "rewards/rejected": -6.869630336761475, + "step": 614 + }, + { + "epoch": 0.93, + "learning_rate": 6.426011879753601e-07, + "logits/chosen": -1.1966629028320312, + "logits/rejected": -1.195890188217163, + "logps/chosen": -70.52161407470703, + "logps/rejected": -149.12550354003906, + "loss": 0.1196, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7128872871398926, + "rewards/margins": 6.004786968231201, + "rewards/rejected": -6.717674255371094, + "step": 615 + }, + { + "epoch": 0.94, + "learning_rate": 6.413291022251989e-07, + "logits/chosen": -1.1653528213500977, + "logits/rejected": -1.158420205116272, + "logps/chosen": -74.13883972167969, + "logps/rejected": -167.26708984375, + "loss": 0.1286, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6810168623924255, + "rewards/margins": 8.279891967773438, + "rewards/rejected": -8.960908889770508, + "step": 616 + }, + { + "epoch": 0.94, + "learning_rate": 6.400560214644864e-07, + "logits/chosen": -0.7961425185203552, + "logits/rejected": -0.6948035359382629, + "logps/chosen": -31.742719650268555, + "logps/rejected": -98.01490783691406, + "loss": 0.1302, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.42019566893577576, + "rewards/margins": 5.272375106811523, + "rewards/rejected": -4.852179527282715, + "step": 617 + }, + { + "epoch": 0.94, + "learning_rate": 6.387819546561953e-07, + "logits/chosen": -1.3594671487808228, + "logits/rejected": -1.3466832637786865, + "logps/chosen": -79.9526596069336, + "logps/rejected": -133.43017578125, + "loss": 0.1571, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4597373008728027, + "rewards/margins": 4.369678974151611, + "rewards/rejected": -5.829415798187256, + "step": 618 + }, + { + "epoch": 0.94, + "learning_rate": 6.375069107702392e-07, + "logits/chosen": -1.2339191436767578, + "logits/rejected": -1.30702543258667, + "logps/chosen": -69.71017456054688, + "logps/rejected": -168.69784545898438, + "loss": 0.079, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4460238814353943, + "rewards/margins": 8.786514282226562, + "rewards/rejected": -9.232538223266602, + "step": 619 + }, + { + "epoch": 0.94, + "learning_rate": 6.362308987834115e-07, + "logits/chosen": -1.2344510555267334, + "logits/rejected": -1.2518203258514404, + "logps/chosen": -85.0146484375, + "logps/rejected": -191.4388885498047, + "loss": 0.0993, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8330772519111633, + "rewards/margins": 9.515462875366211, + "rewards/rejected": -10.348539352416992, + "step": 620 + }, + { + "epoch": 0.94, + "learning_rate": 6.349539276793211e-07, + "logits/chosen": -1.0936007499694824, + "logits/rejected": -1.0148943662643433, + "logps/chosen": -62.80587387084961, + "logps/rejected": -144.54388427734375, + "loss": 0.1469, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3292310833930969, + "rewards/margins": 6.688003063201904, + "rewards/rejected": -7.017233848571777, + "step": 621 + }, + { + "epoch": 0.94, + "learning_rate": 6.336760064483295e-07, + "logits/chosen": -1.2999571561813354, + "logits/rejected": -1.261387586593628, + "logps/chosen": -85.21759033203125, + "logps/rejected": -180.78350830078125, + "loss": 0.0991, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6249150037765503, + "rewards/margins": 8.264779090881348, + "rewards/rejected": -9.889694213867188, + "step": 622 + }, + { + "epoch": 0.95, + "learning_rate": 6.323971440874877e-07, + "logits/chosen": -1.0580947399139404, + "logits/rejected": -1.1377376317977905, + "logps/chosen": -76.76827239990234, + "logps/rejected": -186.6950225830078, + "loss": 0.1408, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8835163116455078, + "rewards/margins": 9.402647972106934, + "rewards/rejected": -10.286163330078125, + "step": 623 + }, + { + "epoch": 0.95, + "learning_rate": 6.311173496004723e-07, + "logits/chosen": -1.0193018913269043, + "logits/rejected": -1.020062804222107, + "logps/chosen": -67.74279022216797, + "logps/rejected": -119.82437133789062, + "loss": 0.1549, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1380265951156616, + "rewards/margins": 4.763696670532227, + "rewards/rejected": -5.901723384857178, + "step": 624 + }, + { + "epoch": 0.95, + "learning_rate": 6.298366319975221e-07, + "logits/chosen": -0.9675927758216858, + "logits/rejected": -0.9189658164978027, + "logps/chosen": -66.28350067138672, + "logps/rejected": -123.21947479248047, + "loss": 0.1642, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3978164196014404, + "rewards/margins": 4.934099197387695, + "rewards/rejected": -6.331915378570557, + "step": 625 + }, + { + "epoch": 0.95, + "learning_rate": 6.28555000295376e-07, + "logits/chosen": -1.0939494371414185, + "logits/rejected": -1.1183608770370483, + "logps/chosen": -74.27719116210938, + "logps/rejected": -154.91586303710938, + "loss": 0.1485, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6623119711875916, + "rewards/margins": 7.408330917358398, + "rewards/rejected": -8.07064151763916, + "step": 626 + }, + { + "epoch": 0.95, + "learning_rate": 6.272724635172074e-07, + "logits/chosen": -0.9520808458328247, + "logits/rejected": -0.8811392784118652, + "logps/chosen": -70.15853118896484, + "logps/rejected": -183.83602905273438, + "loss": 0.1547, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.32549893856048584, + "rewards/margins": 9.788957595825195, + "rewards/rejected": -10.114457130432129, + "step": 627 + }, + { + "epoch": 0.95, + "learning_rate": 6.259890306925626e-07, + "logits/chosen": -0.9270368814468384, + "logits/rejected": -0.8135693073272705, + "logps/chosen": -63.346412658691406, + "logps/rejected": -158.90957641601562, + "loss": 0.1442, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1709436178207397, + "rewards/margins": 7.03597354888916, + "rewards/rejected": -8.206917762756348, + "step": 628 + }, + { + "epoch": 0.96, + "learning_rate": 6.247047108572959e-07, + "logits/chosen": -1.070979118347168, + "logits/rejected": -0.8744301199913025, + "logps/chosen": -69.09797668457031, + "logps/rejected": -215.55731201171875, + "loss": 0.0971, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5460328459739685, + "rewards/margins": 10.441033363342285, + "rewards/rejected": -10.987065315246582, + "step": 629 + }, + { + "epoch": 0.96, + "learning_rate": 6.234195130535068e-07, + "logits/chosen": -1.290129542350769, + "logits/rejected": -1.2997944355010986, + "logps/chosen": -62.63139724731445, + "logps/rejected": -130.76480102539062, + "loss": 0.0977, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8450179100036621, + "rewards/margins": 5.818835735321045, + "rewards/rejected": -6.663854598999023, + "step": 630 + }, + { + "epoch": 0.96, + "learning_rate": 6.221334463294759e-07, + "logits/chosen": -1.1602667570114136, + "logits/rejected": -1.1095179319381714, + "logps/chosen": -64.07215881347656, + "logps/rejected": -114.46161651611328, + "loss": 0.0942, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4669833779335022, + "rewards/margins": 5.138500690460205, + "rewards/rejected": -5.605484485626221, + "step": 631 + }, + { + "epoch": 0.96, + "learning_rate": 6.208465197396012e-07, + "logits/chosen": -1.2838671207427979, + "logits/rejected": -1.2515268325805664, + "logps/chosen": -93.52930450439453, + "logps/rejected": -193.46759033203125, + "loss": 0.1292, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9162299633026123, + "rewards/margins": 7.588293075561523, + "rewards/rejected": -9.504523277282715, + "step": 632 + }, + { + "epoch": 0.96, + "learning_rate": 6.195587423443348e-07, + "logits/chosen": -1.1156113147735596, + "logits/rejected": -1.0549241304397583, + "logps/chosen": -51.68001174926758, + "logps/rejected": -130.57142639160156, + "loss": 0.0914, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.12324819713830948, + "rewards/margins": 6.490993976593018, + "rewards/rejected": -6.614241600036621, + "step": 633 + }, + { + "epoch": 0.96, + "learning_rate": 6.182701232101184e-07, + "logits/chosen": -0.997816264629364, + "logits/rejected": -0.973334550857544, + "logps/chosen": -62.83354568481445, + "logps/rejected": -147.64317321777344, + "loss": 0.1306, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7381342053413391, + "rewards/margins": 7.281171798706055, + "rewards/rejected": -8.019306182861328, + "step": 634 + }, + { + "epoch": 0.96, + "learning_rate": 6.169806714093203e-07, + "logits/chosen": -1.2990257740020752, + "logits/rejected": -1.242817997932434, + "logps/chosen": -93.56485748291016, + "logps/rejected": -203.56536865234375, + "loss": 0.1267, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3125959634780884, + "rewards/margins": 8.971162796020508, + "rewards/rejected": -10.283758163452148, + "step": 635 + }, + { + "epoch": 0.97, + "learning_rate": 6.156903960201708e-07, + "logits/chosen": -1.0316754579544067, + "logits/rejected": -0.9951988458633423, + "logps/chosen": -89.90121459960938, + "logps/rejected": -209.39590454101562, + "loss": 0.1043, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4963710308074951, + "rewards/margins": 9.438202857971191, + "rewards/rejected": -10.93457317352295, + "step": 636 + }, + { + "epoch": 0.97, + "learning_rate": 6.143993061266985e-07, + "logits/chosen": -0.9450584053993225, + "logits/rejected": -0.9507660865783691, + "logps/chosen": -61.39887619018555, + "logps/rejected": -100.94102478027344, + "loss": 0.1086, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.806435465812683, + "rewards/margins": 3.4823813438415527, + "rewards/rejected": -5.288816928863525, + "step": 637 + }, + { + "epoch": 0.97, + "learning_rate": 6.131074108186665e-07, + "logits/chosen": -1.2073445320129395, + "logits/rejected": -1.1909887790679932, + "logps/chosen": -75.20411682128906, + "logps/rejected": -137.83969116210938, + "loss": 0.224, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8708873987197876, + "rewards/margins": 5.583239555358887, + "rewards/rejected": -6.454126834869385, + "step": 638 + }, + { + "epoch": 0.97, + "learning_rate": 6.118147191915087e-07, + "logits/chosen": -1.032888412475586, + "logits/rejected": -0.9318363070487976, + "logps/chosen": -87.4946517944336, + "logps/rejected": -209.93983459472656, + "loss": 0.1081, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2013015747070312, + "rewards/margins": 9.4359712600708, + "rewards/rejected": -11.637272834777832, + "step": 639 + }, + { + "epoch": 0.97, + "learning_rate": 6.105212403462649e-07, + "logits/chosen": -0.9409430027008057, + "logits/rejected": -0.8462395668029785, + "logps/chosen": -74.47772979736328, + "logps/rejected": -165.8389129638672, + "loss": 0.096, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.5092302560806274, + "rewards/margins": 7.441671848297119, + "rewards/rejected": -8.95090103149414, + "step": 640 + }, + { + "epoch": 0.97, + "learning_rate": 6.092269833895174e-07, + "logits/chosen": -1.1518549919128418, + "logits/rejected": -1.1024231910705566, + "logps/chosen": -101.31788635253906, + "logps/rejected": -186.49029541015625, + "loss": 0.1025, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.271648645401001, + "rewards/margins": 7.782963752746582, + "rewards/rejected": -10.054612159729004, + "step": 641 + }, + { + "epoch": 0.98, + "learning_rate": 6.079319574333266e-07, + "logits/chosen": -1.2157964706420898, + "logits/rejected": -1.2053083181381226, + "logps/chosen": -74.29100799560547, + "logps/rejected": -162.5016326904297, + "loss": 0.1264, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9477126598358154, + "rewards/margins": 7.87605619430542, + "rewards/rejected": -8.823769569396973, + "step": 642 + }, + { + "epoch": 0.98, + "learning_rate": 6.06636171595167e-07, + "logits/chosen": -1.2841458320617676, + "logits/rejected": -1.2092390060424805, + "logps/chosen": -56.44172668457031, + "logps/rejected": -134.9115447998047, + "loss": 0.0907, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.044044479727745056, + "rewards/margins": 6.794293403625488, + "rewards/rejected": -6.8383378982543945, + "step": 643 + }, + { + "epoch": 0.98, + "learning_rate": 6.053396349978631e-07, + "logits/chosen": -1.1039835214614868, + "logits/rejected": -1.0356587171554565, + "logps/chosen": -76.9747543334961, + "logps/rejected": -172.63150024414062, + "loss": 0.1098, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.146126389503479, + "rewards/margins": 7.621996879577637, + "rewards/rejected": -8.768123626708984, + "step": 644 + }, + { + "epoch": 0.98, + "learning_rate": 6.04042356769525e-07, + "logits/chosen": -1.264845609664917, + "logits/rejected": -1.2502721548080444, + "logps/chosen": -85.20533752441406, + "logps/rejected": -194.13870239257812, + "loss": 0.1003, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8115068674087524, + "rewards/margins": 9.807979583740234, + "rewards/rejected": -10.619485855102539, + "step": 645 + }, + { + "epoch": 0.98, + "learning_rate": 6.02744346043484e-07, + "logits/chosen": -1.2147037982940674, + "logits/rejected": -1.1134579181671143, + "logps/chosen": -87.7901382446289, + "logps/rejected": -193.6415252685547, + "loss": 0.1185, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5775516033172607, + "rewards/margins": 8.363851547241211, + "rewards/rejected": -9.941402435302734, + "step": 646 + }, + { + "epoch": 0.98, + "learning_rate": 6.014456119582284e-07, + "logits/chosen": -1.2615370750427246, + "logits/rejected": -1.1352978944778442, + "logps/chosen": -86.64071655273438, + "logps/rejected": -170.6793975830078, + "loss": 0.1168, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.381377100944519, + "rewards/margins": 7.180846214294434, + "rewards/rejected": -8.562223434448242, + "step": 647 + }, + { + "epoch": 0.98, + "learning_rate": 6.001461636573396e-07, + "logits/chosen": -1.0515718460083008, + "logits/rejected": -0.9838770031929016, + "logps/chosen": -59.810577392578125, + "logps/rejected": -126.24922180175781, + "loss": 0.0817, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.5042865872383118, + "rewards/margins": 5.515542984008789, + "rewards/rejected": -6.019829750061035, + "step": 648 + }, + { + "epoch": 0.99, + "learning_rate": 5.98846010289427e-07, + "logits/chosen": -1.1093281507492065, + "logits/rejected": -1.077587604522705, + "logps/chosen": -64.88356018066406, + "logps/rejected": -154.37062072753906, + "loss": 0.0841, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.40832287073135376, + "rewards/margins": 7.622497081756592, + "rewards/rejected": -8.030820846557617, + "step": 649 + }, + { + "epoch": 0.99, + "learning_rate": 5.975451610080642e-07, + "logits/chosen": -0.8989431858062744, + "logits/rejected": -0.9193077087402344, + "logps/chosen": -54.4132080078125, + "logps/rejected": -117.14277648925781, + "loss": 0.1351, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4165533185005188, + "rewards/margins": 5.752285003662109, + "rewards/rejected": -6.1688385009765625, + "step": 650 + }, + { + "epoch": 0.99, + "learning_rate": 5.962436249717239e-07, + "logits/chosen": -1.2257202863693237, + "logits/rejected": -1.159430980682373, + "logps/chosen": -69.6572036743164, + "logps/rejected": -169.44932556152344, + "loss": 0.098, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.28377628326416016, + "rewards/margins": 8.203788757324219, + "rewards/rejected": -7.920012474060059, + "step": 651 + }, + { + "epoch": 0.99, + "learning_rate": 5.949414113437141e-07, + "logits/chosen": -1.2154548168182373, + "logits/rejected": -1.2897298336029053, + "logps/chosen": -54.942771911621094, + "logps/rejected": -133.26797485351562, + "loss": 0.0998, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.29021626710891724, + "rewards/margins": 6.945834159851074, + "rewards/rejected": -7.236050128936768, + "step": 652 + }, + { + "epoch": 0.99, + "learning_rate": 5.936385292921135e-07, + "logits/chosen": -1.2028741836547852, + "logits/rejected": -1.119994044303894, + "logps/chosen": -69.02983093261719, + "logps/rejected": -143.38055419921875, + "loss": 0.1776, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5186907052993774, + "rewards/margins": 4.823304653167725, + "rewards/rejected": -6.3419952392578125, + "step": 653 + }, + { + "epoch": 0.99, + "learning_rate": 5.923349879897064e-07, + "logits/chosen": -1.2728404998779297, + "logits/rejected": -1.2099133729934692, + "logps/chosen": -61.5665283203125, + "logps/rejected": -172.33697509765625, + "loss": 0.1038, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6831641793251038, + "rewards/margins": 8.591301918029785, + "rewards/rejected": -9.274466514587402, + "step": 654 + }, + { + "epoch": 1.0, + "learning_rate": 5.910307966139186e-07, + "logits/chosen": -1.142350196838379, + "logits/rejected": -1.0968624353408813, + "logps/chosen": -64.24427795410156, + "logps/rejected": -144.29763793945312, + "loss": 0.0897, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.833078145980835, + "rewards/margins": 6.916269302368164, + "rewards/rejected": -7.74934720993042, + "step": 655 + }, + { + "epoch": 1.0, + "learning_rate": 5.897259643467527e-07, + "logits/chosen": -1.0089267492294312, + "logits/rejected": -0.9968331456184387, + "logps/chosen": -54.45252990722656, + "logps/rejected": -102.59598541259766, + "loss": 0.1371, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8583024740219116, + "rewards/margins": 4.281520843505859, + "rewards/rejected": -5.139822959899902, + "step": 656 + }, + { + "epoch": 1.0, + "learning_rate": 5.884205003747232e-07, + "logits/chosen": -1.1683781147003174, + "logits/rejected": -1.1888394355773926, + "logps/chosen": -62.79545593261719, + "logps/rejected": -126.7721176147461, + "loss": 0.1603, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5220464468002319, + "rewards/margins": 6.045437812805176, + "rewards/rejected": -6.567483901977539, + "step": 657 + }, + { + "epoch": 1.0, + "learning_rate": 5.871144138887925e-07, + "logits/chosen": -1.076742172241211, + "logits/rejected": -1.0606688261032104, + "logps/chosen": -65.71561431884766, + "logps/rejected": -129.7261962890625, + "loss": 0.131, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1118013858795166, + "rewards/margins": 6.00579833984375, + "rewards/rejected": -7.117599964141846, + "step": 658 + }, + { + "epoch": 1.0, + "learning_rate": 5.858077140843052e-07, + "logits/chosen": -1.158136010169983, + "logits/rejected": -1.1475896835327148, + "logps/chosen": -60.0682258605957, + "logps/rejected": -131.699462890625, + "loss": 0.1057, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2798491418361664, + "rewards/margins": 6.119170665740967, + "rewards/rejected": -6.399019718170166, + "step": 659 + }, + { + "epoch": 1.0, + "learning_rate": 5.845004101609246e-07, + "logits/chosen": -1.4042613506317139, + "logits/rejected": -1.4403735399246216, + "logps/chosen": -67.27396392822266, + "logps/rejected": -124.99378967285156, + "loss": 0.0976, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6702808141708374, + "rewards/margins": 5.292868614196777, + "rewards/rejected": -5.963149070739746, + "step": 660 + }, + { + "epoch": 1.0, + "learning_rate": 5.831925113225663e-07, + "logits/chosen": -1.1096900701522827, + "logits/rejected": -1.030463695526123, + "logps/chosen": -80.2137680053711, + "logps/rejected": -188.515869140625, + "loss": 0.0865, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5864979028701782, + "rewards/margins": 8.952061653137207, + "rewards/rejected": -10.53856086730957, + "step": 661 + }, + { + "epoch": 1.01, + "learning_rate": 5.818840267773349e-07, + "logits/chosen": -0.9514877200126648, + "logits/rejected": -0.8386536836624146, + "logps/chosen": -54.629302978515625, + "logps/rejected": -147.75096130371094, + "loss": 0.1456, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3283122777938843, + "rewards/margins": 7.612514972686768, + "rewards/rejected": -7.940827369689941, + "step": 662 + }, + { + "epoch": 1.01, + "learning_rate": 5.805749657374588e-07, + "logits/chosen": -1.1909164190292358, + "logits/rejected": -1.1824414730072021, + "logps/chosen": -63.21338653564453, + "logps/rejected": -125.61174011230469, + "loss": 0.0928, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7514258623123169, + "rewards/margins": 5.9479804039001465, + "rewards/rejected": -6.699406623840332, + "step": 663 + }, + { + "epoch": 1.01, + "learning_rate": 5.792653374192245e-07, + "logits/chosen": -1.1508904695510864, + "logits/rejected": -1.3032901287078857, + "logps/chosen": -61.64646911621094, + "logps/rejected": -123.74299621582031, + "loss": 0.09, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6384060382843018, + "rewards/margins": 6.525364875793457, + "rewards/rejected": -7.163771152496338, + "step": 664 + }, + { + "epoch": 1.01, + "learning_rate": 5.77955151042913e-07, + "logits/chosen": -1.2269688844680786, + "logits/rejected": -1.102317452430725, + "logps/chosen": -93.61415100097656, + "logps/rejected": -197.66168212890625, + "loss": 0.1308, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.865402102470398, + "rewards/margins": 8.771673202514648, + "rewards/rejected": -10.63707447052002, + "step": 665 + }, + { + "epoch": 1.01, + "learning_rate": 5.766444158327337e-07, + "logits/chosen": -1.2553646564483643, + "logits/rejected": -1.2647689580917358, + "logps/chosen": -69.70663452148438, + "logps/rejected": -166.05563354492188, + "loss": 0.0516, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5620317459106445, + "rewards/margins": 7.381424903869629, + "rewards/rejected": -7.943456649780273, + "step": 666 + }, + { + "epoch": 1.01, + "learning_rate": 5.753331410167603e-07, + "logits/chosen": -1.1728168725967407, + "logits/rejected": -1.2360785007476807, + "logps/chosen": -57.126678466796875, + "logps/rejected": -115.8658218383789, + "loss": 0.0699, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10996098071336746, + "rewards/margins": 6.259988307952881, + "rewards/rejected": -6.369948387145996, + "step": 667 + }, + { + "epoch": 1.01, + "learning_rate": 5.740213358268658e-07, + "logits/chosen": -0.9818707704544067, + "logits/rejected": -0.935701310634613, + "logps/chosen": -77.70005798339844, + "logps/rejected": -182.51943969726562, + "loss": 0.1275, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9125512838363647, + "rewards/margins": 8.70441722869873, + "rewards/rejected": -9.616969108581543, + "step": 668 + }, + { + "epoch": 1.02, + "learning_rate": 5.727090094986565e-07, + "logits/chosen": -1.1261427402496338, + "logits/rejected": -1.0947426557540894, + "logps/chosen": -68.04671478271484, + "logps/rejected": -151.35067749023438, + "loss": 0.12, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8326579928398132, + "rewards/margins": 7.375097274780273, + "rewards/rejected": -8.207755088806152, + "step": 669 + }, + { + "epoch": 1.02, + "learning_rate": 5.713961712714081e-07, + "logits/chosen": -0.9983550310134888, + "logits/rejected": -0.9126099944114685, + "logps/chosen": -63.41878128051758, + "logps/rejected": -154.33389282226562, + "loss": 0.0974, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.12092608213424683, + "rewards/margins": 7.515420913696289, + "rewards/rejected": -7.636347770690918, + "step": 670 + }, + { + "epoch": 1.02, + "learning_rate": 5.700828303880006e-07, + "logits/chosen": -1.217376470565796, + "logits/rejected": -1.2310144901275635, + "logps/chosen": -81.50133514404297, + "logps/rejected": -163.76443481445312, + "loss": 0.1043, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1945643424987793, + "rewards/margins": 7.473373889923096, + "rewards/rejected": -8.667938232421875, + "step": 671 + }, + { + "epoch": 1.02, + "learning_rate": 5.687689960948525e-07, + "logits/chosen": -1.0253369808197021, + "logits/rejected": -1.0140191316604614, + "logps/chosen": -84.7760238647461, + "logps/rejected": -134.62786865234375, + "loss": 0.0932, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9770020246505737, + "rewards/margins": 5.535608291625977, + "rewards/rejected": -6.51261043548584, + "step": 672 + }, + { + "epoch": 1.02, + "learning_rate": 5.674546776418559e-07, + "logits/chosen": -0.7614624500274658, + "logits/rejected": -0.7193310856819153, + "logps/chosen": -57.43425369262695, + "logps/rejected": -145.88856506347656, + "loss": 0.1625, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0262550115585327, + "rewards/margins": 6.430345058441162, + "rewards/rejected": -7.456599712371826, + "step": 673 + }, + { + "epoch": 1.02, + "learning_rate": 5.661398842823121e-07, + "logits/chosen": -1.3408358097076416, + "logits/rejected": -1.267204761505127, + "logps/chosen": -42.15625, + "logps/rejected": -116.25086975097656, + "loss": 0.0826, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09829363226890564, + "rewards/margins": 6.09843111038208, + "rewards/rejected": -6.0001373291015625, + "step": 674 + }, + { + "epoch": 1.03, + "learning_rate": 5.648246252728657e-07, + "logits/chosen": -1.1724097728729248, + "logits/rejected": -1.1310337781906128, + "logps/chosen": -55.03935623168945, + "logps/rejected": -119.53876495361328, + "loss": 0.0846, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5357794165611267, + "rewards/margins": 6.028209209442139, + "rewards/rejected": -6.563988208770752, + "step": 675 + }, + { + "epoch": 1.03, + "learning_rate": 5.635089098734393e-07, + "logits/chosen": -1.2830966711044312, + "logits/rejected": -1.3221076726913452, + "logps/chosen": -68.17839813232422, + "logps/rejected": -150.27639770507812, + "loss": 0.1029, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5815032720565796, + "rewards/margins": 7.11434268951416, + "rewards/rejected": -8.695844650268555, + "step": 676 + }, + { + "epoch": 1.03, + "learning_rate": 5.621927473471694e-07, + "logits/chosen": -0.9430112838745117, + "logits/rejected": -0.9096443057060242, + "logps/chosen": -58.878753662109375, + "logps/rejected": -130.36016845703125, + "loss": 0.1278, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7708883881568909, + "rewards/margins": 5.786856174468994, + "rewards/rejected": -6.55774450302124, + "step": 677 + }, + { + "epoch": 1.03, + "learning_rate": 5.608761469603397e-07, + "logits/chosen": -0.8836669325828552, + "logits/rejected": -0.8740648031234741, + "logps/chosen": -52.28067398071289, + "logps/rejected": -135.568115234375, + "loss": 0.1107, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.21213576197624207, + "rewards/margins": 7.3774189949035645, + "rewards/rejected": -7.589554786682129, + "step": 678 + }, + { + "epoch": 1.03, + "learning_rate": 5.595591179823169e-07, + "logits/chosen": -1.1525969505310059, + "logits/rejected": -1.0391813516616821, + "logps/chosen": -84.83979797363281, + "logps/rejected": -174.16342163085938, + "loss": 0.0779, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6824384927749634, + "rewards/margins": 6.232317924499512, + "rewards/rejected": -7.914756774902344, + "step": 679 + }, + { + "epoch": 1.03, + "learning_rate": 5.582416696854852e-07, + "logits/chosen": -0.9567781686782837, + "logits/rejected": -0.9765966534614563, + "logps/chosen": -63.5609245300293, + "logps/rejected": -121.71739196777344, + "loss": 0.1131, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3712667226791382, + "rewards/margins": 5.644521713256836, + "rewards/rejected": -7.015789031982422, + "step": 680 + }, + { + "epoch": 1.03, + "learning_rate": 5.569238113451812e-07, + "logits/chosen": -1.154358148574829, + "logits/rejected": -1.1891201734542847, + "logps/chosen": -89.4549560546875, + "logps/rejected": -149.6629180908203, + "loss": 0.082, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4142396450042725, + "rewards/margins": 5.829531669616699, + "rewards/rejected": -7.243770599365234, + "step": 681 + }, + { + "epoch": 1.04, + "learning_rate": 5.556055522396278e-07, + "logits/chosen": -0.9599528908729553, + "logits/rejected": -0.918071985244751, + "logps/chosen": -47.804283142089844, + "logps/rejected": -113.82530975341797, + "loss": 0.0802, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1189282089471817, + "rewards/margins": 5.937498569488525, + "rewards/rejected": -6.056427001953125, + "step": 682 + }, + { + "epoch": 1.04, + "learning_rate": 5.542869016498698e-07, + "logits/chosen": -1.0925543308258057, + "logits/rejected": -1.1219251155853271, + "logps/chosen": -81.27593231201172, + "logps/rejected": -157.37948608398438, + "loss": 0.0534, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6112722158432007, + "rewards/margins": 6.5523152351379395, + "rewards/rejected": -8.16358757019043, + "step": 683 + }, + { + "epoch": 1.04, + "learning_rate": 5.52967868859708e-07, + "logits/chosen": -1.1082345247268677, + "logits/rejected": -1.1396856307983398, + "logps/chosen": -62.943992614746094, + "logps/rejected": -148.8440399169922, + "loss": 0.1339, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6015743613243103, + "rewards/margins": 8.0671968460083, + "rewards/rejected": -8.668770790100098, + "step": 684 + }, + { + "epoch": 1.04, + "learning_rate": 5.516484631556344e-07, + "logits/chosen": -1.0701913833618164, + "logits/rejected": -1.1307215690612793, + "logps/chosen": -72.89879608154297, + "logps/rejected": -143.15277099609375, + "loss": 0.1164, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9773444533348083, + "rewards/margins": 6.877935886383057, + "rewards/rejected": -7.85528039932251, + "step": 685 + }, + { + "epoch": 1.04, + "learning_rate": 5.50328693826766e-07, + "logits/chosen": -0.9838832020759583, + "logits/rejected": -0.962506115436554, + "logps/chosen": -73.59190368652344, + "logps/rejected": -140.7711181640625, + "loss": 0.1997, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.6346116065979004, + "rewards/margins": 6.033596038818359, + "rewards/rejected": -7.66820764541626, + "step": 686 + }, + { + "epoch": 1.04, + "learning_rate": 5.490085701647804e-07, + "logits/chosen": -0.9443256855010986, + "logits/rejected": -0.8821437358856201, + "logps/chosen": -71.37468719482422, + "logps/rejected": -148.31710815429688, + "loss": 0.1297, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.4407132863998413, + "rewards/margins": 6.951409339904785, + "rewards/rejected": -8.392123222351074, + "step": 687 + }, + { + "epoch": 1.05, + "learning_rate": 5.47688101463849e-07, + "logits/chosen": -1.0126395225524902, + "logits/rejected": -0.9197251200675964, + "logps/chosen": -75.32856750488281, + "logps/rejected": -186.19622802734375, + "loss": 0.1014, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0315072536468506, + "rewards/margins": 9.135555267333984, + "rewards/rejected": -10.16706371307373, + "step": 688 + }, + { + "epoch": 1.05, + "learning_rate": 5.463672970205733e-07, + "logits/chosen": -0.8753904104232788, + "logits/rejected": -0.926545262336731, + "logps/chosen": -58.62400817871094, + "logps/rejected": -125.887451171875, + "loss": 0.099, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2632928490638733, + "rewards/margins": 6.818629741668701, + "rewards/rejected": -7.08192253112793, + "step": 689 + }, + { + "epoch": 1.05, + "learning_rate": 5.450461661339182e-07, + "logits/chosen": -1.041439175605774, + "logits/rejected": -0.9572014808654785, + "logps/chosen": -69.90070343017578, + "logps/rejected": -182.95445251464844, + "loss": 0.0985, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.5516085028648376, + "rewards/margins": 10.155409812927246, + "rewards/rejected": -10.70701789855957, + "step": 690 + }, + { + "epoch": 1.05, + "learning_rate": 5.437247181051465e-07, + "logits/chosen": -1.1116018295288086, + "logits/rejected": -1.052459955215454, + "logps/chosen": -48.75881576538086, + "logps/rejected": -117.23747253417969, + "loss": 0.1613, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09138579666614532, + "rewards/margins": 5.856074333190918, + "rewards/rejected": -5.947459697723389, + "step": 691 + }, + { + "epoch": 1.05, + "learning_rate": 5.424029622377546e-07, + "logits/chosen": -1.282362937927246, + "logits/rejected": -1.273923635482788, + "logps/chosen": -51.164772033691406, + "logps/rejected": -126.75209045410156, + "loss": 0.0455, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.18534865975379944, + "rewards/margins": 7.127676486968994, + "rewards/rejected": -6.942327976226807, + "step": 692 + }, + { + "epoch": 1.05, + "learning_rate": 5.410809078374054e-07, + "logits/chosen": -1.0203771591186523, + "logits/rejected": -0.940859854221344, + "logps/chosen": -76.81683349609375, + "logps/rejected": -148.52162170410156, + "loss": 0.0958, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5949409008026123, + "rewards/margins": 6.496974468231201, + "rewards/rejected": -8.091915130615234, + "step": 693 + }, + { + "epoch": 1.05, + "learning_rate": 5.397585642118642e-07, + "logits/chosen": -1.1015177965164185, + "logits/rejected": -1.0720669031143188, + "logps/chosen": -40.1603889465332, + "logps/rejected": -75.624267578125, + "loss": 0.0951, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.00646597146987915, + "rewards/margins": 3.2788314819335938, + "rewards/rejected": -3.285297393798828, + "step": 694 + }, + { + "epoch": 1.06, + "learning_rate": 5.384359406709321e-07, + "logits/chosen": -1.0925345420837402, + "logits/rejected": -1.019098162651062, + "logps/chosen": -58.54066467285156, + "logps/rejected": -149.29501342773438, + "loss": 0.1441, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.05131404101848602, + "rewards/margins": 8.232137680053711, + "rewards/rejected": -8.180824279785156, + "step": 695 + }, + { + "epoch": 1.06, + "learning_rate": 5.371130465263812e-07, + "logits/chosen": -1.318426489830017, + "logits/rejected": -1.385157585144043, + "logps/chosen": -71.2918930053711, + "logps/rejected": -153.06190490722656, + "loss": 0.099, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2728750705718994, + "rewards/margins": 7.585086345672607, + "rewards/rejected": -8.85796070098877, + "step": 696 + }, + { + "epoch": 1.06, + "learning_rate": 5.357898910918888e-07, + "logits/chosen": -1.0089809894561768, + "logits/rejected": -1.0487353801727295, + "logps/chosen": -68.19590759277344, + "logps/rejected": -142.8970947265625, + "loss": 0.1073, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8273400664329529, + "rewards/margins": 6.7305779457092285, + "rewards/rejected": -7.557918548583984, + "step": 697 + }, + { + "epoch": 1.06, + "learning_rate": 5.344664836829714e-07, + "logits/chosen": -1.3282719850540161, + "logits/rejected": -1.225130558013916, + "logps/chosen": -77.95903015136719, + "logps/rejected": -204.59254455566406, + "loss": 0.1005, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.112104892730713, + "rewards/margins": 10.1814603805542, + "rewards/rejected": -11.29356575012207, + "step": 698 + }, + { + "epoch": 1.06, + "learning_rate": 5.331428336169198e-07, + "logits/chosen": -0.9961109757423401, + "logits/rejected": -0.9357060194015503, + "logps/chosen": -63.64835739135742, + "logps/rejected": -147.5065155029297, + "loss": 0.072, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9150161743164062, + "rewards/margins": 7.08585262298584, + "rewards/rejected": -8.000868797302246, + "step": 699 + }, + { + "epoch": 1.06, + "learning_rate": 5.318189502127331e-07, + "logits/chosen": -1.2980554103851318, + "logits/rejected": -1.264257550239563, + "logps/chosen": -59.385414123535156, + "logps/rejected": -149.30679321289062, + "loss": 0.136, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2459990531206131, + "rewards/margins": 7.867907524108887, + "rewards/rejected": -8.113906860351562, + "step": 700 + }, + { + "epoch": 1.06, + "learning_rate": 5.304948427910534e-07, + "logits/chosen": -1.3335720300674438, + "logits/rejected": -1.3249469995498657, + "logps/chosen": -73.70133972167969, + "logps/rejected": -159.0903778076172, + "loss": 0.0867, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3161687850952148, + "rewards/margins": 7.501766204833984, + "rewards/rejected": -8.8179349899292, + "step": 701 + }, + { + "epoch": 1.07, + "learning_rate": 5.291705206740996e-07, + "logits/chosen": -1.1001014709472656, + "logits/rejected": -1.0837841033935547, + "logps/chosen": -51.384239196777344, + "logps/rejected": -125.36265563964844, + "loss": 0.0738, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3471025228500366, + "rewards/margins": 6.566239356994629, + "rewards/rejected": -6.913341045379639, + "step": 702 + }, + { + "epoch": 1.07, + "learning_rate": 5.278459931856026e-07, + "logits/chosen": -1.2043085098266602, + "logits/rejected": -1.137099266052246, + "logps/chosen": -86.36485290527344, + "logps/rejected": -184.11178588867188, + "loss": 0.1143, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.149524688720703, + "rewards/margins": 8.116349220275879, + "rewards/rejected": -10.265874862670898, + "step": 703 + }, + { + "epoch": 1.07, + "learning_rate": 5.265212696507386e-07, + "logits/chosen": -1.2772239446640015, + "logits/rejected": -1.2306042909622192, + "logps/chosen": -76.60639953613281, + "logps/rejected": -166.2823486328125, + "loss": 0.0798, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8685569763183594, + "rewards/margins": 7.428231716156006, + "rewards/rejected": -9.296789169311523, + "step": 704 + }, + { + "epoch": 1.07, + "learning_rate": 5.251963593960646e-07, + "logits/chosen": -0.8318597078323364, + "logits/rejected": -0.8045398592948914, + "logps/chosen": -71.75132751464844, + "logps/rejected": -161.3377685546875, + "loss": 0.0748, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.956432580947876, + "rewards/margins": 7.814340591430664, + "rewards/rejected": -9.770774841308594, + "step": 705 + }, + { + "epoch": 1.07, + "learning_rate": 5.238712717494517e-07, + "logits/chosen": -1.0191956758499146, + "logits/rejected": -1.0569789409637451, + "logps/chosen": -78.18263244628906, + "logps/rejected": -167.52734375, + "loss": 0.0967, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.746780514717102, + "rewards/margins": 7.930570125579834, + "rewards/rejected": -9.677350044250488, + "step": 706 + }, + { + "epoch": 1.07, + "learning_rate": 5.225460160400204e-07, + "logits/chosen": -1.0451310873031616, + "logits/rejected": -1.1556812524795532, + "logps/chosen": -95.20954132080078, + "logps/rejected": -165.25765991210938, + "loss": 0.1126, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.155846118927002, + "rewards/margins": 7.055589199066162, + "rewards/rejected": -9.211434364318848, + "step": 707 + }, + { + "epoch": 1.08, + "learning_rate": 5.212206015980741e-07, + "logits/chosen": -1.0930476188659668, + "logits/rejected": -1.013002872467041, + "logps/chosen": -83.26821899414062, + "logps/rejected": -197.95700073242188, + "loss": 0.0779, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.0534558296203613, + "rewards/margins": 9.577096939086914, + "rewards/rejected": -11.630553245544434, + "step": 708 + }, + { + "epoch": 1.08, + "learning_rate": 5.198950377550338e-07, + "logits/chosen": -1.0042184591293335, + "logits/rejected": -1.0928484201431274, + "logps/chosen": -66.0024185180664, + "logps/rejected": -122.8520278930664, + "loss": 0.1358, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2644383907318115, + "rewards/margins": 5.515614032745361, + "rewards/rejected": -6.780052185058594, + "step": 709 + }, + { + "epoch": 1.08, + "learning_rate": 5.185693338433723e-07, + "logits/chosen": -1.212796688079834, + "logits/rejected": -1.2040300369262695, + "logps/chosen": -68.95840454101562, + "logps/rejected": -158.29884338378906, + "loss": 0.1171, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.497808814048767, + "rewards/margins": 7.9521260261535645, + "rewards/rejected": -9.449934005737305, + "step": 710 + }, + { + "epoch": 1.08, + "learning_rate": 5.172434991965486e-07, + "logits/chosen": -1.2450261116027832, + "logits/rejected": -1.1200731992721558, + "logps/chosen": -77.91986083984375, + "logps/rejected": -185.11241149902344, + "loss": 0.0812, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8076395988464355, + "rewards/margins": 9.032342910766602, + "rewards/rejected": -10.839981079101562, + "step": 711 + }, + { + "epoch": 1.08, + "learning_rate": 5.159175431489423e-07, + "logits/chosen": -0.7942067980766296, + "logits/rejected": -0.7741928100585938, + "logps/chosen": -62.70838165283203, + "logps/rejected": -142.3878631591797, + "loss": 0.061, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2866111993789673, + "rewards/margins": 6.572202682495117, + "rewards/rejected": -7.858813285827637, + "step": 712 + }, + { + "epoch": 1.08, + "learning_rate": 5.145914750357871e-07, + "logits/chosen": -1.1640188694000244, + "logits/rejected": -1.197671890258789, + "logps/chosen": -68.85122680664062, + "logps/rejected": -147.9771728515625, + "loss": 0.0701, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.203361988067627, + "rewards/margins": 7.032736301422119, + "rewards/rejected": -8.236098289489746, + "step": 713 + }, + { + "epoch": 1.08, + "learning_rate": 5.132653041931066e-07, + "logits/chosen": -1.0039440393447876, + "logits/rejected": -1.0339112281799316, + "logps/chosen": -76.23931121826172, + "logps/rejected": -160.29669189453125, + "loss": 0.0564, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1478352546691895, + "rewards/margins": 7.5223469734191895, + "rewards/rejected": -8.670182228088379, + "step": 714 + }, + { + "epoch": 1.09, + "learning_rate": 5.119390399576468e-07, + "logits/chosen": -0.8470580577850342, + "logits/rejected": -0.9359773993492126, + "logps/chosen": -57.678245544433594, + "logps/rejected": -119.07586669921875, + "loss": 0.0875, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.49064531922340393, + "rewards/margins": 6.0010857582092285, + "rewards/rejected": -6.4917311668396, + "step": 715 + }, + { + "epoch": 1.09, + "learning_rate": 5.106126916668118e-07, + "logits/chosen": -1.144890308380127, + "logits/rejected": -1.0987181663513184, + "logps/chosen": -64.77657318115234, + "logps/rejected": -135.87718200683594, + "loss": 0.0479, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6729116439819336, + "rewards/margins": 6.5454912185668945, + "rewards/rejected": -7.2184038162231445, + "step": 716 + }, + { + "epoch": 1.09, + "learning_rate": 5.09286268658597e-07, + "logits/chosen": -1.190121054649353, + "logits/rejected": -1.1410903930664062, + "logps/chosen": -83.95819091796875, + "logps/rejected": -177.8282470703125, + "loss": 0.0826, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7526713609695435, + "rewards/margins": 7.734089374542236, + "rewards/rejected": -9.486761093139648, + "step": 717 + }, + { + "epoch": 1.09, + "learning_rate": 5.079597802715244e-07, + "logits/chosen": -1.1426438093185425, + "logits/rejected": -1.1871936321258545, + "logps/chosen": -57.17544937133789, + "logps/rejected": -133.5987548828125, + "loss": 0.0571, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.2977384924888611, + "rewards/margins": 6.940124988555908, + "rewards/rejected": -7.237863540649414, + "step": 718 + }, + { + "epoch": 1.09, + "learning_rate": 5.066332358445759e-07, + "logits/chosen": -1.3283708095550537, + "logits/rejected": -1.2297604084014893, + "logps/chosen": -73.53825378417969, + "logps/rejected": -182.50379943847656, + "loss": 0.0742, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9024177193641663, + "rewards/margins": 8.631746292114258, + "rewards/rejected": -9.534162521362305, + "step": 719 + }, + { + "epoch": 1.09, + "learning_rate": 5.053066447171282e-07, + "logits/chosen": -1.1588116884231567, + "logits/rejected": -1.0975029468536377, + "logps/chosen": -52.616249084472656, + "logps/rejected": -128.71990966796875, + "loss": 0.0525, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.11890238523483276, + "rewards/margins": 6.472389221191406, + "rewards/rejected": -6.35348653793335, + "step": 720 + }, + { + "epoch": 1.1, + "learning_rate": 5.039800162288861e-07, + "logits/chosen": -1.2566020488739014, + "logits/rejected": -1.1881121397018433, + "logps/chosen": -66.51480102539062, + "logps/rejected": -154.63040161132812, + "loss": 0.0662, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4931876063346863, + "rewards/margins": 7.1658220291137695, + "rewards/rejected": -7.6590094566345215, + "step": 721 + }, + { + "epoch": 1.1, + "learning_rate": 5.026533597198185e-07, + "logits/chosen": -1.1822502613067627, + "logits/rejected": -1.270106315612793, + "logps/chosen": -58.7943115234375, + "logps/rejected": -158.77145385742188, + "loss": 0.0786, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7609018683433533, + "rewards/margins": 8.432029724121094, + "rewards/rejected": -9.19293212890625, + "step": 722 + }, + { + "epoch": 1.1, + "learning_rate": 5.013266845300907e-07, + "logits/chosen": -1.0390766859054565, + "logits/rejected": -1.0265178680419922, + "logps/chosen": -81.84119415283203, + "logps/rejected": -185.984619140625, + "loss": 0.1292, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.834659993648529, + "rewards/margins": 9.396257400512695, + "rewards/rejected": -10.230918884277344, + "step": 723 + }, + { + "epoch": 1.1, + "learning_rate": 5e-07, + "logits/chosen": -1.0787450075149536, + "logits/rejected": -1.0342082977294922, + "logps/chosen": -63.2120475769043, + "logps/rejected": -176.89993286132812, + "loss": 0.0899, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23369872570037842, + "rewards/margins": 9.546956062316895, + "rewards/rejected": -9.780654907226562, + "step": 724 + }, + { + "epoch": 1.1, + "learning_rate": 4.986733154699093e-07, + "logits/chosen": -0.9337677955627441, + "logits/rejected": -0.9124805927276611, + "logps/chosen": -61.646949768066406, + "logps/rejected": -143.59019470214844, + "loss": 0.0293, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0404541492462158, + "rewards/margins": 6.85941743850708, + "rewards/rejected": -7.899870872497559, + "step": 725 + }, + { + "epoch": 1.1, + "learning_rate": 4.973466402801817e-07, + "logits/chosen": -0.9301645755767822, + "logits/rejected": -0.9025458097457886, + "logps/chosen": -73.25839233398438, + "logps/rejected": -163.77919006347656, + "loss": 0.1275, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.781755805015564, + "rewards/margins": 7.804413795471191, + "rewards/rejected": -8.586169242858887, + "step": 726 + }, + { + "epoch": 1.1, + "learning_rate": 4.96019983771114e-07, + "logits/chosen": -1.2382862567901611, + "logits/rejected": -1.2279412746429443, + "logps/chosen": -59.01118850708008, + "logps/rejected": -126.78030395507812, + "loss": 0.058, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.22560197114944458, + "rewards/margins": 6.76422643661499, + "rewards/rejected": -6.538625240325928, + "step": 727 + }, + { + "epoch": 1.11, + "learning_rate": 4.946933552828719e-07, + "logits/chosen": -1.1101487874984741, + "logits/rejected": -1.0449861288070679, + "logps/chosen": -67.08702087402344, + "logps/rejected": -158.53958129882812, + "loss": 0.0577, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9119900465011597, + "rewards/margins": 7.477686882019043, + "rewards/rejected": -8.389676094055176, + "step": 728 + }, + { + "epoch": 1.11, + "learning_rate": 4.933667641554239e-07, + "logits/chosen": -1.3548754453659058, + "logits/rejected": -1.3058669567108154, + "logps/chosen": -49.29551696777344, + "logps/rejected": -130.4243927001953, + "loss": 0.0712, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.08856362104415894, + "rewards/margins": 6.403277397155762, + "rewards/rejected": -6.491840362548828, + "step": 729 + }, + { + "epoch": 1.11, + "learning_rate": 4.920402197284755e-07, + "logits/chosen": -1.146132469177246, + "logits/rejected": -1.08512544631958, + "logps/chosen": -42.733612060546875, + "logps/rejected": -121.80816650390625, + "loss": 0.0672, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.32784441113471985, + "rewards/margins": 6.611300945281982, + "rewards/rejected": -6.283456325531006, + "step": 730 + }, + { + "epoch": 1.11, + "learning_rate": 4.907137313414029e-07, + "logits/chosen": -1.1103661060333252, + "logits/rejected": -1.1141341924667358, + "logps/chosen": -51.15158462524414, + "logps/rejected": -115.07316589355469, + "loss": 0.134, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09807372093200684, + "rewards/margins": 6.130924701690674, + "rewards/rejected": -6.032850742340088, + "step": 731 + }, + { + "epoch": 1.11, + "learning_rate": 4.893873083331882e-07, + "logits/chosen": -1.0536171197891235, + "logits/rejected": -1.0562716722488403, + "logps/chosen": -109.25856018066406, + "logps/rejected": -196.98011779785156, + "loss": 0.1269, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6777667999267578, + "rewards/margins": 8.406275749206543, + "rewards/rejected": -10.0840425491333, + "step": 732 + }, + { + "epoch": 1.11, + "learning_rate": 4.880609600423532e-07, + "logits/chosen": -1.1645655632019043, + "logits/rejected": -1.2337405681610107, + "logps/chosen": -54.793243408203125, + "logps/rejected": -112.279541015625, + "loss": 0.0856, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.10907458513975143, + "rewards/margins": 5.2775654792785645, + "rewards/rejected": -5.386639595031738, + "step": 733 + }, + { + "epoch": 1.12, + "learning_rate": 4.867346958068934e-07, + "logits/chosen": -1.1355609893798828, + "logits/rejected": -1.1663737297058105, + "logps/chosen": -80.99940490722656, + "logps/rejected": -165.85794067382812, + "loss": 0.0913, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6650447845458984, + "rewards/margins": 7.612807273864746, + "rewards/rejected": -9.277853012084961, + "step": 734 + }, + { + "epoch": 1.12, + "learning_rate": 4.854085249642127e-07, + "logits/chosen": -1.149308681488037, + "logits/rejected": -1.1109539270401, + "logps/chosen": -51.61331558227539, + "logps/rejected": -123.82764434814453, + "loss": 0.1262, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23800213634967804, + "rewards/margins": 6.6616621017456055, + "rewards/rejected": -6.899663925170898, + "step": 735 + }, + { + "epoch": 1.12, + "learning_rate": 4.840824568510579e-07, + "logits/chosen": -1.017869472503662, + "logits/rejected": -1.0294277667999268, + "logps/chosen": -53.62151336669922, + "logps/rejected": -113.24836730957031, + "loss": 0.0701, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.047505542635917664, + "rewards/margins": 5.728215217590332, + "rewards/rejected": -5.775720596313477, + "step": 736 + }, + { + "epoch": 1.12, + "learning_rate": 4.827565008034513e-07, + "logits/chosen": -1.199344515800476, + "logits/rejected": -1.1634290218353271, + "logps/chosen": -58.40774154663086, + "logps/rejected": -160.16136169433594, + "loss": 0.0303, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14727118611335754, + "rewards/margins": 8.634309768676758, + "rewards/rejected": -8.781580924987793, + "step": 737 + }, + { + "epoch": 1.12, + "learning_rate": 4.814306661566276e-07, + "logits/chosen": -1.1273925304412842, + "logits/rejected": -1.0789045095443726, + "logps/chosen": -44.35386276245117, + "logps/rejected": -138.4711151123047, + "loss": 0.0861, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6509190797805786, + "rewards/margins": 7.575811862945557, + "rewards/rejected": -6.924892425537109, + "step": 738 + }, + { + "epoch": 1.12, + "learning_rate": 4.801049622449661e-07, + "logits/chosen": -1.1626410484313965, + "logits/rejected": -1.0463327169418335, + "logps/chosen": -91.6008529663086, + "logps/rejected": -211.8026580810547, + "loss": 0.0279, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2525895833969116, + "rewards/margins": 10.100064277648926, + "rewards/rejected": -11.352653503417969, + "step": 739 + }, + { + "epoch": 1.12, + "learning_rate": 4.787793984019259e-07, + "logits/chosen": -0.9621848464012146, + "logits/rejected": -0.9286797046661377, + "logps/chosen": -56.69253158569336, + "logps/rejected": -142.10772705078125, + "loss": 0.0685, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3490813374519348, + "rewards/margins": 7.836462497711182, + "rewards/rejected": -8.18554401397705, + "step": 740 + }, + { + "epoch": 1.13, + "learning_rate": 4.774539839599795e-07, + "logits/chosen": -1.0346853733062744, + "logits/rejected": -0.9935694336891174, + "logps/chosen": -58.07199478149414, + "logps/rejected": -158.4248046875, + "loss": 0.0885, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.49401575326919556, + "rewards/margins": 8.622398376464844, + "rewards/rejected": -9.116415023803711, + "step": 741 + }, + { + "epoch": 1.13, + "learning_rate": 4.7612872825054817e-07, + "logits/chosen": -1.0496529340744019, + "logits/rejected": -0.9647389054298401, + "logps/chosen": -62.4256591796875, + "logps/rejected": -206.42465209960938, + "loss": 0.1518, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1937393844127655, + "rewards/margins": 12.18332290649414, + "rewards/rejected": -12.377062797546387, + "step": 742 + }, + { + "epoch": 1.13, + "learning_rate": 4.748036406039355e-07, + "logits/chosen": -1.1184381246566772, + "logits/rejected": -1.0451523065567017, + "logps/chosen": -66.47551727294922, + "logps/rejected": -167.14010620117188, + "loss": 0.0533, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6011109948158264, + "rewards/margins": 8.966035842895508, + "rewards/rejected": -9.567147254943848, + "step": 743 + }, + { + "epoch": 1.13, + "learning_rate": 4.7347873034926146e-07, + "logits/chosen": -0.9570289254188538, + "logits/rejected": -0.9113000631332397, + "logps/chosen": -77.79519653320312, + "logps/rejected": -189.1246795654297, + "loss": 0.167, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7696459889411926, + "rewards/margins": 9.580473899841309, + "rewards/rejected": -10.350118637084961, + "step": 744 + }, + { + "epoch": 1.13, + "learning_rate": 4.7215400681439743e-07, + "logits/chosen": -1.0565179586410522, + "logits/rejected": -1.0138272047042847, + "logps/chosen": -78.86212158203125, + "logps/rejected": -174.55279541015625, + "loss": 0.0503, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0645883083343506, + "rewards/margins": 8.36618423461914, + "rewards/rejected": -9.43077278137207, + "step": 745 + }, + { + "epoch": 1.13, + "learning_rate": 4.708294793259004e-07, + "logits/chosen": -1.296499252319336, + "logits/rejected": -1.2552895545959473, + "logps/chosen": -48.52935028076172, + "logps/rejected": -123.3971939086914, + "loss": 0.0774, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.389154851436615, + "rewards/margins": 6.000330448150635, + "rewards/rejected": -6.3894853591918945, + "step": 746 + }, + { + "epoch": 1.13, + "learning_rate": 4.6950515720894655e-07, + "logits/chosen": -1.2907040119171143, + "logits/rejected": -1.2215681076049805, + "logps/chosen": -69.718994140625, + "logps/rejected": -184.017333984375, + "loss": 0.0883, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23798534274101257, + "rewards/margins": 10.50714111328125, + "rewards/rejected": -10.745126724243164, + "step": 747 + }, + { + "epoch": 1.14, + "learning_rate": 4.681810497872668e-07, + "logits/chosen": -1.1115891933441162, + "logits/rejected": -1.0429208278656006, + "logps/chosen": -75.26403045654297, + "logps/rejected": -183.2494659423828, + "loss": 0.0792, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2758240699768066, + "rewards/margins": 8.724857330322266, + "rewards/rejected": -10.00068187713623, + "step": 748 + }, + { + "epoch": 1.14, + "learning_rate": 4.6685716638308016e-07, + "logits/chosen": -1.1416871547698975, + "logits/rejected": -1.0443594455718994, + "logps/chosen": -57.0560417175293, + "logps/rejected": -134.08177185058594, + "loss": 0.0655, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.595421314239502, + "rewards/margins": 6.1352925300598145, + "rewards/rejected": -6.730713844299316, + "step": 749 + }, + { + "epoch": 1.14, + "learning_rate": 4.6553351631702877e-07, + "logits/chosen": -1.1689854860305786, + "logits/rejected": -1.208797812461853, + "logps/chosen": -77.22966766357422, + "logps/rejected": -198.64471435546875, + "loss": 0.074, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0628044605255127, + "rewards/margins": 11.271867752075195, + "rewards/rejected": -12.334672927856445, + "step": 750 + }, + { + "epoch": 1.14, + "learning_rate": 4.6421010890811124e-07, + "logits/chosen": -0.98302161693573, + "logits/rejected": -1.1160837411880493, + "logps/chosen": -66.82015991210938, + "logps/rejected": -113.2428970336914, + "loss": 0.0948, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4074597954750061, + "rewards/margins": 5.847441673278809, + "rewards/rejected": -6.25490140914917, + "step": 751 + }, + { + "epoch": 1.14, + "learning_rate": 4.628869534736187e-07, + "logits/chosen": -1.2641061544418335, + "logits/rejected": -1.1559064388275146, + "logps/chosen": -76.30941772460938, + "logps/rejected": -177.64637756347656, + "loss": 0.0812, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2699906826019287, + "rewards/margins": 8.294047355651855, + "rewards/rejected": -9.564037322998047, + "step": 752 + }, + { + "epoch": 1.14, + "learning_rate": 4.615640593290679e-07, + "logits/chosen": -1.1859060525894165, + "logits/rejected": -1.150189995765686, + "logps/chosen": -60.840484619140625, + "logps/rejected": -160.75506591796875, + "loss": 0.056, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03157242760062218, + "rewards/margins": 8.924668312072754, + "rewards/rejected": -8.9562406539917, + "step": 753 + }, + { + "epoch": 1.15, + "learning_rate": 4.6024143578813585e-07, + "logits/chosen": -1.1408933401107788, + "logits/rejected": -1.115440011024475, + "logps/chosen": -47.346519470214844, + "logps/rejected": -107.71475219726562, + "loss": 0.1403, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.033688440918922424, + "rewards/margins": 5.437056541442871, + "rewards/rejected": -5.40336799621582, + "step": 754 + }, + { + "epoch": 1.15, + "learning_rate": 4.589190921625945e-07, + "logits/chosen": -1.2576284408569336, + "logits/rejected": -1.3243356943130493, + "logps/chosen": -54.915672302246094, + "logps/rejected": -117.61421203613281, + "loss": 0.076, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.27700039744377136, + "rewards/margins": 6.533145427703857, + "rewards/rejected": -6.256145000457764, + "step": 755 + }, + { + "epoch": 1.15, + "learning_rate": 4.5759703776224555e-07, + "logits/chosen": -1.008704423904419, + "logits/rejected": -1.0297294855117798, + "logps/chosen": -71.18299865722656, + "logps/rejected": -167.969970703125, + "loss": 0.1322, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2875728905200958, + "rewards/margins": 9.068160057067871, + "rewards/rejected": -9.355732917785645, + "step": 756 + }, + { + "epoch": 1.15, + "learning_rate": 4.562752818948535e-07, + "logits/chosen": -1.204525351524353, + "logits/rejected": -1.1385844945907593, + "logps/chosen": -61.3125, + "logps/rejected": -150.655029296875, + "loss": 0.0791, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0254082679748535, + "rewards/margins": 7.04811954498291, + "rewards/rejected": -8.073528289794922, + "step": 757 + }, + { + "epoch": 1.15, + "learning_rate": 4.549538338660819e-07, + "logits/chosen": -1.015628695487976, + "logits/rejected": -1.0417346954345703, + "logps/chosen": -59.74385070800781, + "logps/rejected": -128.64451599121094, + "loss": 0.0606, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5621964335441589, + "rewards/margins": 5.849483013153076, + "rewards/rejected": -6.411679744720459, + "step": 758 + }, + { + "epoch": 1.15, + "learning_rate": 4.536327029794266e-07, + "logits/chosen": -1.0389518737792969, + "logits/rejected": -0.9769172072410583, + "logps/chosen": -52.239601135253906, + "logps/rejected": -131.2997589111328, + "loss": 0.0649, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1164703369140625, + "rewards/margins": 6.758030891418457, + "rewards/rejected": -6.874501705169678, + "step": 759 + }, + { + "epoch": 1.15, + "learning_rate": 4.52311898536151e-07, + "logits/chosen": -1.0630335807800293, + "logits/rejected": -0.9963873028755188, + "logps/chosen": -58.79426574707031, + "logps/rejected": -149.952392578125, + "loss": 0.0734, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3736463189125061, + "rewards/margins": 7.867257595062256, + "rewards/rejected": -8.240903854370117, + "step": 760 + }, + { + "epoch": 1.16, + "learning_rate": 4.5099142983521963e-07, + "logits/chosen": -1.0330792665481567, + "logits/rejected": -1.0546962022781372, + "logps/chosen": -52.70293045043945, + "logps/rejected": -146.3146514892578, + "loss": 0.0577, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.17132572829723358, + "rewards/margins": 8.526296615600586, + "rewards/rejected": -8.354971885681152, + "step": 761 + }, + { + "epoch": 1.16, + "learning_rate": 4.4967130617323396e-07, + "logits/chosen": -0.9147458672523499, + "logits/rejected": -0.6992746591567993, + "logps/chosen": -74.70170593261719, + "logps/rejected": -243.60031127929688, + "loss": 0.0623, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.19599175453186035, + "rewards/margins": 13.283580780029297, + "rewards/rejected": -13.479572296142578, + "step": 762 + }, + { + "epoch": 1.16, + "learning_rate": 4.4835153684436567e-07, + "logits/chosen": -1.1678838729858398, + "logits/rejected": -1.1080782413482666, + "logps/chosen": -64.56056213378906, + "logps/rejected": -157.8324432373047, + "loss": 0.0514, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7837637066841125, + "rewards/margins": 7.835894584655762, + "rewards/rejected": -8.619658470153809, + "step": 763 + }, + { + "epoch": 1.16, + "learning_rate": 4.47032131140292e-07, + "logits/chosen": -1.1793973445892334, + "logits/rejected": -1.0700643062591553, + "logps/chosen": -70.48400115966797, + "logps/rejected": -186.6561279296875, + "loss": 0.0555, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7886835932731628, + "rewards/margins": 10.022167205810547, + "rewards/rejected": -10.81085205078125, + "step": 764 + }, + { + "epoch": 1.16, + "learning_rate": 4.4571309835013023e-07, + "logits/chosen": -1.312821626663208, + "logits/rejected": -1.2918648719787598, + "logps/chosen": -52.059349060058594, + "logps/rejected": -124.93989562988281, + "loss": 0.077, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7141761779785156, + "rewards/margins": 5.787374019622803, + "rewards/rejected": -6.50154972076416, + "step": 765 + }, + { + "epoch": 1.16, + "learning_rate": 4.4439444776037217e-07, + "logits/chosen": -1.3349573612213135, + "logits/rejected": -1.3212847709655762, + "logps/chosen": -77.37397766113281, + "logps/rejected": -165.1543731689453, + "loss": 0.0884, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3379329442977905, + "rewards/margins": 7.1123127937316895, + "rewards/rejected": -8.450244903564453, + "step": 766 + }, + { + "epoch": 1.17, + "learning_rate": 4.430761886548189e-07, + "logits/chosen": -1.183845043182373, + "logits/rejected": -1.145720362663269, + "logps/chosen": -57.94763946533203, + "logps/rejected": -123.90760803222656, + "loss": 0.0495, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.35080718994140625, + "rewards/margins": 6.037484645843506, + "rewards/rejected": -6.388291835784912, + "step": 767 + }, + { + "epoch": 1.17, + "learning_rate": 4.417583303145147e-07, + "logits/chosen": -1.2359590530395508, + "logits/rejected": -1.1837353706359863, + "logps/chosen": -59.06725311279297, + "logps/rejected": -127.3287582397461, + "loss": 0.0818, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.2838151752948761, + "rewards/margins": 6.447656631469727, + "rewards/rejected": -6.731472015380859, + "step": 768 + }, + { + "epoch": 1.17, + "learning_rate": 4.4044088201768305e-07, + "logits/chosen": -1.1694906949996948, + "logits/rejected": -1.2630912065505981, + "logps/chosen": -59.07255935668945, + "logps/rejected": -146.60903930664062, + "loss": 0.0751, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7602745890617371, + "rewards/margins": 8.458105087280273, + "rewards/rejected": -7.6978302001953125, + "step": 769 + }, + { + "epoch": 1.17, + "learning_rate": 4.391238530396605e-07, + "logits/chosen": -1.2331902980804443, + "logits/rejected": -1.1801191568374634, + "logps/chosen": -63.41059112548828, + "logps/rejected": -170.8983154296875, + "loss": 0.0899, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.19194763898849487, + "rewards/margins": 9.201603889465332, + "rewards/rejected": -9.009657859802246, + "step": 770 + }, + { + "epoch": 1.17, + "learning_rate": 4.378072526528307e-07, + "logits/chosen": -1.1899259090423584, + "logits/rejected": -1.2759406566619873, + "logps/chosen": -55.15724563598633, + "logps/rejected": -125.78352355957031, + "loss": 0.0513, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8366152048110962, + "rewards/margins": 8.044254302978516, + "rewards/rejected": -7.207639694213867, + "step": 771 + }, + { + "epoch": 1.17, + "learning_rate": 4.364910901265606e-07, + "logits/chosen": -1.1296507120132446, + "logits/rejected": -1.1952402591705322, + "logps/chosen": -77.33753967285156, + "logps/rejected": -154.226806640625, + "loss": 0.0335, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.2563350200653076, + "rewards/margins": 8.133346557617188, + "rewards/rejected": -8.389680862426758, + "step": 772 + }, + { + "epoch": 1.17, + "learning_rate": 4.351753747271345e-07, + "logits/chosen": -1.1918877363204956, + "logits/rejected": -1.2347781658172607, + "logps/chosen": -68.38977813720703, + "logps/rejected": -152.2249755859375, + "loss": 0.1108, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.38625311851501465, + "rewards/margins": 8.195649147033691, + "rewards/rejected": -8.581901550292969, + "step": 773 + }, + { + "epoch": 1.18, + "learning_rate": 4.3386011571768793e-07, + "logits/chosen": -1.0707042217254639, + "logits/rejected": -1.1057032346725464, + "logps/chosen": -69.99359893798828, + "logps/rejected": -173.17208862304688, + "loss": 0.0633, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.719723641872406, + "rewards/margins": 8.93809986114502, + "rewards/rejected": -9.657824516296387, + "step": 774 + }, + { + "epoch": 1.18, + "learning_rate": 4.3254532235814413e-07, + "logits/chosen": -1.1420342922210693, + "logits/rejected": -1.1160727739334106, + "logps/chosen": -69.51921081542969, + "logps/rejected": -153.108154296875, + "loss": 0.061, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3570714294910431, + "rewards/margins": 7.29402494430542, + "rewards/rejected": -7.651096343994141, + "step": 775 + }, + { + "epoch": 1.18, + "learning_rate": 4.312310039051476e-07, + "logits/chosen": -1.095452070236206, + "logits/rejected": -1.0368789434432983, + "logps/chosen": -83.53199768066406, + "logps/rejected": -195.30474853515625, + "loss": 0.1013, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0821952819824219, + "rewards/margins": 9.629199981689453, + "rewards/rejected": -10.711397171020508, + "step": 776 + }, + { + "epoch": 1.18, + "learning_rate": 4.2991716961199944e-07, + "logits/chosen": -0.8502026796340942, + "logits/rejected": -0.7925280928611755, + "logps/chosen": -79.38339233398438, + "logps/rejected": -211.51736450195312, + "loss": 0.0997, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.30591297149658203, + "rewards/margins": 11.01159954071045, + "rewards/rejected": -11.317513465881348, + "step": 777 + }, + { + "epoch": 1.18, + "learning_rate": 4.2860382872859183e-07, + "logits/chosen": -1.3151302337646484, + "logits/rejected": -1.2342097759246826, + "logps/chosen": -58.19136047363281, + "logps/rejected": -159.36239624023438, + "loss": 0.0546, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2004951536655426, + "rewards/margins": 8.885076522827148, + "rewards/rejected": -8.684579849243164, + "step": 778 + }, + { + "epoch": 1.18, + "learning_rate": 4.2729099050134356e-07, + "logits/chosen": -1.213114857673645, + "logits/rejected": -1.2209107875823975, + "logps/chosen": -74.55901336669922, + "logps/rejected": -196.17433166503906, + "loss": 0.0438, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.4468156695365906, + "rewards/margins": 9.722015380859375, + "rewards/rejected": -10.168830871582031, + "step": 779 + }, + { + "epoch": 1.18, + "learning_rate": 4.259786641731343e-07, + "logits/chosen": -1.1325987577438354, + "logits/rejected": -1.2110825777053833, + "logps/chosen": -73.87116241455078, + "logps/rejected": -161.33755493164062, + "loss": 0.0677, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8084357976913452, + "rewards/margins": 8.306604385375977, + "rewards/rejected": -9.115039825439453, + "step": 780 + }, + { + "epoch": 1.19, + "learning_rate": 4.246668589832396e-07, + "logits/chosen": -1.125104546546936, + "logits/rejected": -1.0887819528579712, + "logps/chosen": -59.11391067504883, + "logps/rejected": -134.1604766845703, + "loss": 0.0513, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.044446349143981934, + "rewards/margins": 6.709604740142822, + "rewards/rejected": -6.665159225463867, + "step": 781 + }, + { + "epoch": 1.19, + "learning_rate": 4.2335558416726627e-07, + "logits/chosen": -1.0693738460540771, + "logits/rejected": -1.0744346380233765, + "logps/chosen": -52.576473236083984, + "logps/rejected": -119.21489715576172, + "loss": 0.0383, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3390309512615204, + "rewards/margins": 6.409571170806885, + "rewards/rejected": -6.070540428161621, + "step": 782 + }, + { + "epoch": 1.19, + "learning_rate": 4.2204484895708714e-07, + "logits/chosen": -1.290875792503357, + "logits/rejected": -1.282123327255249, + "logps/chosen": -56.516658782958984, + "logps/rejected": -121.05769348144531, + "loss": 0.1121, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.08843201398849487, + "rewards/margins": 6.462881565093994, + "rewards/rejected": -6.551313400268555, + "step": 783 + }, + { + "epoch": 1.19, + "learning_rate": 4.2073466258077556e-07, + "logits/chosen": -1.4439988136291504, + "logits/rejected": -1.4561644792556763, + "logps/chosen": -73.92353820800781, + "logps/rejected": -142.3273162841797, + "loss": 0.1049, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6018531918525696, + "rewards/margins": 6.388706207275391, + "rewards/rejected": -6.9905595779418945, + "step": 784 + }, + { + "epoch": 1.19, + "learning_rate": 4.194250342625413e-07, + "logits/chosen": -1.2636879682540894, + "logits/rejected": -1.0962374210357666, + "logps/chosen": -57.09210205078125, + "logps/rejected": -185.80169677734375, + "loss": 0.0585, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.26259636878967285, + "rewards/margins": 10.314652442932129, + "rewards/rejected": -10.052056312561035, + "step": 785 + }, + { + "epoch": 1.19, + "learning_rate": 4.18115973222665e-07, + "logits/chosen": -1.1007850170135498, + "logits/rejected": -0.9807206392288208, + "logps/chosen": -54.816993713378906, + "logps/rejected": -158.8098907470703, + "loss": 0.1046, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10611987113952637, + "rewards/margins": 8.275077819824219, + "rewards/rejected": -8.381197929382324, + "step": 786 + }, + { + "epoch": 1.2, + "learning_rate": 4.1680748867743385e-07, + "logits/chosen": -1.102643609046936, + "logits/rejected": -1.0911848545074463, + "logps/chosen": -53.98735809326172, + "logps/rejected": -147.85398864746094, + "loss": 0.0512, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2515929639339447, + "rewards/margins": 8.081626892089844, + "rewards/rejected": -7.830034255981445, + "step": 787 + }, + { + "epoch": 1.2, + "learning_rate": 4.154995898390755e-07, + "logits/chosen": -1.157513976097107, + "logits/rejected": -1.1232832670211792, + "logps/chosen": -61.06407928466797, + "logps/rejected": -150.46083068847656, + "loss": 0.054, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15333883464336395, + "rewards/margins": 7.626152038574219, + "rewards/rejected": -7.779490947723389, + "step": 788 + }, + { + "epoch": 1.2, + "learning_rate": 4.1419228591569466e-07, + "logits/chosen": -1.1882693767547607, + "logits/rejected": -1.1941783428192139, + "logps/chosen": -80.35359191894531, + "logps/rejected": -202.92909240722656, + "loss": 0.0402, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3616008162498474, + "rewards/margins": 11.065812110900879, + "rewards/rejected": -11.427413940429688, + "step": 789 + }, + { + "epoch": 1.2, + "learning_rate": 4.1288558611120755e-07, + "logits/chosen": -1.2141138315200806, + "logits/rejected": -1.2202541828155518, + "logps/chosen": -78.9452133178711, + "logps/rejected": -175.32675170898438, + "loss": 0.0625, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.28721117973327637, + "rewards/margins": 9.105508804321289, + "rewards/rejected": -8.818297386169434, + "step": 790 + }, + { + "epoch": 1.2, + "learning_rate": 4.115794996252768e-07, + "logits/chosen": -1.2565443515777588, + "logits/rejected": -1.2638403177261353, + "logps/chosen": -42.99260711669922, + "logps/rejected": -106.03839874267578, + "loss": 0.035, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6804541349411011, + "rewards/margins": 5.897651672363281, + "rewards/rejected": -5.217196941375732, + "step": 791 + }, + { + "epoch": 1.2, + "learning_rate": 4.102740356532473e-07, + "logits/chosen": -1.3011318445205688, + "logits/rejected": -1.3293147087097168, + "logps/chosen": -63.77526092529297, + "logps/rejected": -138.43740844726562, + "loss": 0.0363, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5142470598220825, + "rewards/margins": 7.463236331939697, + "rewards/rejected": -6.948988914489746, + "step": 792 + }, + { + "epoch": 1.2, + "learning_rate": 4.089692033860815e-07, + "logits/chosen": -0.9690642356872559, + "logits/rejected": -0.9215153455734253, + "logps/chosen": -64.09129333496094, + "logps/rejected": -166.85317993164062, + "loss": 0.0386, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1762584149837494, + "rewards/margins": 8.921332359313965, + "rewards/rejected": -9.097590446472168, + "step": 793 + }, + { + "epoch": 1.21, + "learning_rate": 4.0766501201029363e-07, + "logits/chosen": -1.0228105783462524, + "logits/rejected": -0.9508757591247559, + "logps/chosen": -81.23611450195312, + "logps/rejected": -198.60714721679688, + "loss": 0.0862, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1831510066986084, + "rewards/margins": 9.553539276123047, + "rewards/rejected": -10.736690521240234, + "step": 794 + }, + { + "epoch": 1.21, + "learning_rate": 4.0636147070788643e-07, + "logits/chosen": -1.290502667427063, + "logits/rejected": -1.2714859247207642, + "logps/chosen": -56.200138092041016, + "logps/rejected": -134.22470092773438, + "loss": 0.0399, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.44425803422927856, + "rewards/margins": 6.808338165283203, + "rewards/rejected": -7.252595901489258, + "step": 795 + }, + { + "epoch": 1.21, + "learning_rate": 4.0505858865628575e-07, + "logits/chosen": -1.2448666095733643, + "logits/rejected": -1.2700674533843994, + "logps/chosen": -70.80340576171875, + "logps/rejected": -155.4434051513672, + "loss": 0.032, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6390804052352905, + "rewards/margins": 7.447866916656494, + "rewards/rejected": -8.086947441101074, + "step": 796 + }, + { + "epoch": 1.21, + "learning_rate": 4.0375637502827617e-07, + "logits/chosen": -1.1842539310455322, + "logits/rejected": -1.098061442375183, + "logps/chosen": -56.274715423583984, + "logps/rejected": -158.17466735839844, + "loss": 0.0202, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.02078975737094879, + "rewards/margins": 9.005572319030762, + "rewards/rejected": -8.984783172607422, + "step": 797 + }, + { + "epoch": 1.21, + "learning_rate": 4.0245483899193586e-07, + "logits/chosen": -1.2747535705566406, + "logits/rejected": -1.1594761610031128, + "logps/chosen": -74.98949432373047, + "logps/rejected": -149.13185119628906, + "loss": 0.0583, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5290836095809937, + "rewards/margins": 6.066576957702637, + "rewards/rejected": -7.595660209655762, + "step": 798 + }, + { + "epoch": 1.21, + "learning_rate": 4.011539897105729e-07, + "logits/chosen": -1.3800020217895508, + "logits/rejected": -1.3897123336791992, + "logps/chosen": -66.18260192871094, + "logps/rejected": -163.44715881347656, + "loss": 0.0388, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5650410652160645, + "rewards/margins": 8.63727855682373, + "rewards/rejected": -9.202320098876953, + "step": 799 + }, + { + "epoch": 1.22, + "learning_rate": 3.9985383634266047e-07, + "logits/chosen": -1.1964800357818604, + "logits/rejected": -1.191982626914978, + "logps/chosen": -55.670166015625, + "logps/rejected": -126.46448516845703, + "loss": 0.0501, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5088732838630676, + "rewards/margins": 6.745451927185059, + "rewards/rejected": -7.254324436187744, + "step": 800 + }, + { + "epoch": 1.22, + "learning_rate": 3.985543880417716e-07, + "logits/chosen": -1.2433134317398071, + "logits/rejected": -1.143684983253479, + "logps/chosen": -52.155784606933594, + "logps/rejected": -148.7881317138672, + "loss": 0.0432, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.35071849822998047, + "rewards/margins": 7.880721569061279, + "rewards/rejected": -8.231439590454102, + "step": 801 + }, + { + "epoch": 1.22, + "learning_rate": 3.9725565395651604e-07, + "logits/chosen": -1.1196131706237793, + "logits/rejected": -1.0766024589538574, + "logps/chosen": -71.76590728759766, + "logps/rejected": -156.6300811767578, + "loss": 0.0427, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.136284589767456, + "rewards/margins": 7.6406779289245605, + "rewards/rejected": -8.776963233947754, + "step": 802 + }, + { + "epoch": 1.22, + "learning_rate": 3.9595764323047494e-07, + "logits/chosen": -1.2914059162139893, + "logits/rejected": -1.2229104042053223, + "logps/chosen": -91.37772369384766, + "logps/rejected": -204.822021484375, + "loss": 0.0845, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.874307632446289, + "rewards/margins": 9.642812728881836, + "rewards/rejected": -11.517120361328125, + "step": 803 + }, + { + "epoch": 1.22, + "learning_rate": 3.94660365002137e-07, + "logits/chosen": -1.113565444946289, + "logits/rejected": -1.1023532152175903, + "logps/chosen": -63.81866455078125, + "logps/rejected": -147.45748901367188, + "loss": 0.0311, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1928701400756836, + "rewards/margins": 8.079753875732422, + "rewards/rejected": -8.272624015808105, + "step": 804 + }, + { + "epoch": 1.22, + "learning_rate": 3.933638284048331e-07, + "logits/chosen": -1.329550862312317, + "logits/rejected": -1.4320032596588135, + "logps/chosen": -67.80931854248047, + "logps/rejected": -131.1941375732422, + "loss": 0.0468, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4901169240474701, + "rewards/margins": 6.499993801116943, + "rewards/rejected": -6.990111351013184, + "step": 805 + }, + { + "epoch": 1.22, + "learning_rate": 3.920680425666735e-07, + "logits/chosen": -1.1646313667297363, + "logits/rejected": -1.1623016595840454, + "logps/chosen": -63.63052749633789, + "logps/rejected": -144.2456512451172, + "loss": 0.0449, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0916926860809326, + "rewards/margins": 6.8504252433776855, + "rewards/rejected": -7.942118167877197, + "step": 806 + }, + { + "epoch": 1.23, + "learning_rate": 3.907730166104827e-07, + "logits/chosen": -1.2554950714111328, + "logits/rejected": -1.1933375597000122, + "logps/chosen": -79.49732971191406, + "logps/rejected": -189.63839721679688, + "loss": 0.03, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5523549318313599, + "rewards/margins": 10.084354400634766, + "rewards/rejected": -10.636709213256836, + "step": 807 + }, + { + "epoch": 1.23, + "learning_rate": 3.894787596537351e-07, + "logits/chosen": -1.1525908708572388, + "logits/rejected": -1.1488728523254395, + "logps/chosen": -59.99504852294922, + "logps/rejected": -141.78363037109375, + "loss": 0.0271, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23121660947799683, + "rewards/margins": 7.967068672180176, + "rewards/rejected": -8.198285102844238, + "step": 808 + }, + { + "epoch": 1.23, + "learning_rate": 3.881852808084912e-07, + "logits/chosen": -1.095705270767212, + "logits/rejected": -0.9978544116020203, + "logps/chosen": -59.360530853271484, + "logps/rejected": -158.34080505371094, + "loss": 0.0475, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3611246347427368, + "rewards/margins": 7.7559919357299805, + "rewards/rejected": -9.117116928100586, + "step": 809 + }, + { + "epoch": 1.23, + "learning_rate": 3.868925891813335e-07, + "logits/chosen": -1.2054696083068848, + "logits/rejected": -1.1077433824539185, + "logps/chosen": -70.31130981445312, + "logps/rejected": -193.58937072753906, + "loss": 0.0494, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4210721254348755, + "rewards/margins": 9.549147605895996, + "rewards/rejected": -10.970220565795898, + "step": 810 + }, + { + "epoch": 1.23, + "learning_rate": 3.856006938733016e-07, + "logits/chosen": -1.473289966583252, + "logits/rejected": -1.4685642719268799, + "logps/chosen": -50.909339904785156, + "logps/rejected": -128.1400909423828, + "loss": 0.0311, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2504890561103821, + "rewards/margins": 7.203729152679443, + "rewards/rejected": -6.953239440917969, + "step": 811 + }, + { + "epoch": 1.23, + "learning_rate": 3.8430960397982926e-07, + "logits/chosen": -1.1097276210784912, + "logits/rejected": -1.0613540410995483, + "logps/chosen": -91.93956756591797, + "logps/rejected": -215.58001708984375, + "loss": 0.0166, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9927880764007568, + "rewards/margins": 11.322046279907227, + "rewards/rejected": -12.314833641052246, + "step": 812 + }, + { + "epoch": 1.24, + "learning_rate": 3.830193285906796e-07, + "logits/chosen": -1.081833839416504, + "logits/rejected": -1.1834239959716797, + "logps/chosen": -47.86272048950195, + "logps/rejected": -114.7242431640625, + "loss": 0.0241, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.044774383306503296, + "rewards/margins": 6.425144672393799, + "rewards/rejected": -6.469919204711914, + "step": 813 + }, + { + "epoch": 1.24, + "learning_rate": 3.817298767898816e-07, + "logits/chosen": -1.0170581340789795, + "logits/rejected": -0.9848339557647705, + "logps/chosen": -74.92778015136719, + "logps/rejected": -209.4849853515625, + "loss": 0.0285, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8389643430709839, + "rewards/margins": 11.954238891601562, + "rewards/rejected": -12.793205261230469, + "step": 814 + }, + { + "epoch": 1.24, + "learning_rate": 3.804412576556652e-07, + "logits/chosen": -1.427770972251892, + "logits/rejected": -1.3738195896148682, + "logps/chosen": -62.669166564941406, + "logps/rejected": -155.71170043945312, + "loss": 0.0534, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2841857075691223, + "rewards/margins": 8.41563606262207, + "rewards/rejected": -8.699822425842285, + "step": 815 + }, + { + "epoch": 1.24, + "learning_rate": 3.791534802603987e-07, + "logits/chosen": -1.2190669775009155, + "logits/rejected": -1.1652584075927734, + "logps/chosen": -52.3328971862793, + "logps/rejected": -121.18106079101562, + "loss": 0.0304, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.26607978343963623, + "rewards/margins": 6.681756019592285, + "rewards/rejected": -6.947835445404053, + "step": 816 + }, + { + "epoch": 1.24, + "learning_rate": 3.778665536705242e-07, + "logits/chosen": -1.4151263236999512, + "logits/rejected": -1.3526246547698975, + "logps/chosen": -58.03881072998047, + "logps/rejected": -131.90274047851562, + "loss": 0.0255, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.752575159072876, + "rewards/margins": 6.559599876403809, + "rewards/rejected": -7.312175750732422, + "step": 817 + }, + { + "epoch": 1.24, + "learning_rate": 3.765804869464932e-07, + "logits/chosen": -1.149936318397522, + "logits/rejected": -1.1367430686950684, + "logps/chosen": -75.39720916748047, + "logps/rejected": -209.57176208496094, + "loss": 0.0389, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0461437702178955, + "rewards/margins": 11.42244815826416, + "rewards/rejected": -12.468591690063477, + "step": 818 + }, + { + "epoch": 1.24, + "learning_rate": 3.75295289142704e-07, + "logits/chosen": -0.8759934902191162, + "logits/rejected": -0.82037353515625, + "logps/chosen": -57.84587860107422, + "logps/rejected": -180.59344482421875, + "loss": 0.0417, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02280382812023163, + "rewards/margins": 10.916155815124512, + "rewards/rejected": -10.938959121704102, + "step": 819 + }, + { + "epoch": 1.25, + "learning_rate": 3.7401096930743746e-07, + "logits/chosen": -1.0861873626708984, + "logits/rejected": -1.0199942588806152, + "logps/chosen": -77.1715316772461, + "logps/rejected": -179.63955688476562, + "loss": 0.0446, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.759091854095459, + "rewards/margins": 9.110944747924805, + "rewards/rejected": -9.870037078857422, + "step": 820 + }, + { + "epoch": 1.25, + "learning_rate": 3.727275364827926e-07, + "logits/chosen": -1.1666381359100342, + "logits/rejected": -1.1186622381210327, + "logps/chosen": -66.40373229980469, + "logps/rejected": -203.10476684570312, + "loss": 0.0235, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6987287402153015, + "rewards/margins": 11.257447242736816, + "rewards/rejected": -11.9561767578125, + "step": 821 + }, + { + "epoch": 1.25, + "learning_rate": 3.714449997046241e-07, + "logits/chosen": -1.1445003747940063, + "logits/rejected": -1.2067350149154663, + "logps/chosen": -82.43601989746094, + "logps/rejected": -173.05908203125, + "loss": 0.0244, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9472718238830566, + "rewards/margins": 9.079296112060547, + "rewards/rejected": -11.026567459106445, + "step": 822 + }, + { + "epoch": 1.25, + "learning_rate": 3.7016336800247775e-07, + "logits/chosen": -1.030447244644165, + "logits/rejected": -0.9534213542938232, + "logps/chosen": -78.53152465820312, + "logps/rejected": -185.37857055664062, + "loss": 0.0293, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0761345624923706, + "rewards/margins": 10.263667106628418, + "rewards/rejected": -11.339801788330078, + "step": 823 + }, + { + "epoch": 1.25, + "learning_rate": 3.6888265039952795e-07, + "logits/chosen": -1.092881202697754, + "logits/rejected": -0.9708372950553894, + "logps/chosen": -63.030845642089844, + "logps/rejected": -187.64028930664062, + "loss": 0.059, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.619552493095398, + "rewards/margins": 10.110761642456055, + "rewards/rejected": -10.730313301086426, + "step": 824 + }, + { + "epoch": 1.25, + "learning_rate": 3.6760285591251226e-07, + "logits/chosen": -0.9170098900794983, + "logits/rejected": -0.921739399433136, + "logps/chosen": -59.816246032714844, + "logps/rejected": -146.72691345214844, + "loss": 0.0377, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2829318344593048, + "rewards/margins": 8.726126670837402, + "rewards/rejected": -9.009058952331543, + "step": 825 + }, + { + "epoch": 1.25, + "learning_rate": 3.663239935516704e-07, + "logits/chosen": -1.335066318511963, + "logits/rejected": -1.2408074140548706, + "logps/chosen": -78.4832992553711, + "logps/rejected": -193.43527221679688, + "loss": 0.0194, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1176406145095825, + "rewards/margins": 8.863214492797852, + "rewards/rejected": -9.980854034423828, + "step": 826 + }, + { + "epoch": 1.26, + "learning_rate": 3.650460723206791e-07, + "logits/chosen": -1.0009205341339111, + "logits/rejected": -0.9793943762779236, + "logps/chosen": -64.65348052978516, + "logps/rejected": -153.94056701660156, + "loss": 0.0406, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0533581972122192, + "rewards/margins": 8.320476531982422, + "rewards/rejected": -9.373835563659668, + "step": 827 + }, + { + "epoch": 1.26, + "learning_rate": 3.637691012165886e-07, + "logits/chosen": -1.1569173336029053, + "logits/rejected": -1.1553266048431396, + "logps/chosen": -69.02538299560547, + "logps/rejected": -170.2616424560547, + "loss": 0.0614, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2905741333961487, + "rewards/margins": 10.302106857299805, + "rewards/rejected": -10.592680931091309, + "step": 828 + }, + { + "epoch": 1.26, + "learning_rate": 3.6249308922976086e-07, + "logits/chosen": -1.0629199743270874, + "logits/rejected": -1.1610177755355835, + "logps/chosen": -64.80350494384766, + "logps/rejected": -153.3122100830078, + "loss": 0.0412, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4723438024520874, + "rewards/margins": 8.992136001586914, + "rewards/rejected": -9.464479446411133, + "step": 829 + }, + { + "epoch": 1.26, + "learning_rate": 3.6121804534380496e-07, + "logits/chosen": -0.9190477132797241, + "logits/rejected": -0.8003554940223694, + "logps/chosen": -71.7896957397461, + "logps/rejected": -185.8133544921875, + "loss": 0.055, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.156874656677246, + "rewards/margins": 9.655036926269531, + "rewards/rejected": -10.811910629272461, + "step": 830 + }, + { + "epoch": 1.26, + "learning_rate": 3.5994397853551356e-07, + "logits/chosen": -1.0027974843978882, + "logits/rejected": -0.952623724937439, + "logps/chosen": -83.22335815429688, + "logps/rejected": -215.75918579101562, + "loss": 0.012, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.064950704574585, + "rewards/margins": 11.617621421813965, + "rewards/rejected": -12.682572364807129, + "step": 831 + }, + { + "epoch": 1.26, + "learning_rate": 3.586708977748012e-07, + "logits/chosen": -1.0339128971099854, + "logits/rejected": -1.0624173879623413, + "logps/chosen": -60.685386657714844, + "logps/rejected": -165.6641082763672, + "loss": 0.0643, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.003927305340766907, + "rewards/margins": 9.649250030517578, + "rewards/rejected": -9.653176307678223, + "step": 832 + }, + { + "epoch": 1.27, + "learning_rate": 3.5739881202463975e-07, + "logits/chosen": -0.8943543434143066, + "logits/rejected": -0.9420751333236694, + "logps/chosen": -72.37063598632812, + "logps/rejected": -166.33132934570312, + "loss": 0.0425, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4235678911209106, + "rewards/margins": 8.860400199890137, + "rewards/rejected": -10.28396987915039, + "step": 833 + }, + { + "epoch": 1.27, + "learning_rate": 3.561277302409962e-07, + "logits/chosen": -1.1255451440811157, + "logits/rejected": -1.038440465927124, + "logps/chosen": -81.85516357421875, + "logps/rejected": -203.66510009765625, + "loss": 0.0385, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0208683013916016, + "rewards/margins": 10.475278854370117, + "rewards/rejected": -12.496146202087402, + "step": 834 + }, + { + "epoch": 1.27, + "learning_rate": 3.548576613727689e-07, + "logits/chosen": -1.1847028732299805, + "logits/rejected": -1.1377449035644531, + "logps/chosen": -71.52493286132812, + "logps/rejected": -189.09942626953125, + "loss": 0.0153, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7599396705627441, + "rewards/margins": 10.621832847595215, + "rewards/rejected": -11.3817720413208, + "step": 835 + }, + { + "epoch": 1.27, + "learning_rate": 3.535886143617248e-07, + "logits/chosen": -1.2053256034851074, + "logits/rejected": -1.1074981689453125, + "logps/chosen": -64.15154266357422, + "logps/rejected": -173.67425537109375, + "loss": 0.013, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6199431419372559, + "rewards/margins": 9.468947410583496, + "rewards/rejected": -10.08889102935791, + "step": 836 + }, + { + "epoch": 1.27, + "learning_rate": 3.5232059814243713e-07, + "logits/chosen": -1.1888346672058105, + "logits/rejected": -1.097103476524353, + "logps/chosen": -55.30160140991211, + "logps/rejected": -211.38125610351562, + "loss": 0.0388, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3952859938144684, + "rewards/margins": 12.751258850097656, + "rewards/rejected": -12.355973243713379, + "step": 837 + }, + { + "epoch": 1.27, + "learning_rate": 3.510536216422213e-07, + "logits/chosen": -1.2884656190872192, + "logits/rejected": -1.2741146087646484, + "logps/chosen": -62.34508514404297, + "logps/rejected": -171.33905029296875, + "loss": 0.0475, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.272865891456604, + "rewards/margins": 9.452814102172852, + "rewards/rejected": -9.725680351257324, + "step": 838 + }, + { + "epoch": 1.27, + "learning_rate": 3.497876937810732e-07, + "logits/chosen": -1.2582658529281616, + "logits/rejected": -1.2361618280410767, + "logps/chosen": -70.01296997070312, + "logps/rejected": -195.9732666015625, + "loss": 0.0218, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9307251572608948, + "rewards/margins": 11.011921882629395, + "rewards/rejected": -11.942646026611328, + "step": 839 + }, + { + "epoch": 1.28, + "learning_rate": 3.485228234716058e-07, + "logits/chosen": -1.075059175491333, + "logits/rejected": -1.038475513458252, + "logps/chosen": -72.45191955566406, + "logps/rejected": -187.55517578125, + "loss": 0.019, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6160510778427124, + "rewards/margins": 9.863422393798828, + "rewards/rejected": -10.479473114013672, + "step": 840 + }, + { + "epoch": 1.28, + "learning_rate": 3.472590196189864e-07, + "logits/chosen": -1.0447841882705688, + "logits/rejected": -1.0268189907073975, + "logps/chosen": -75.5906753540039, + "logps/rejected": -186.53822326660156, + "loss": 0.0154, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.042884960770606995, + "rewards/margins": 10.343118667602539, + "rewards/rejected": -10.386002540588379, + "step": 841 + }, + { + "epoch": 1.28, + "learning_rate": 3.459962911208738e-07, + "logits/chosen": -1.0275884866714478, + "logits/rejected": -0.9557711482048035, + "logps/chosen": -76.58280181884766, + "logps/rejected": -199.26832580566406, + "loss": 0.0165, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4326503276824951, + "rewards/margins": 11.00889778137207, + "rewards/rejected": -12.441548347473145, + "step": 842 + }, + { + "epoch": 1.28, + "learning_rate": 3.447346468673563e-07, + "logits/chosen": -1.1951040029525757, + "logits/rejected": -1.0923504829406738, + "logps/chosen": -84.25811004638672, + "logps/rejected": -200.55804443359375, + "loss": 0.0295, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4290473461151123, + "rewards/margins": 9.567483901977539, + "rewards/rejected": -10.99653148651123, + "step": 843 + }, + { + "epoch": 1.28, + "learning_rate": 3.4347409574088894e-07, + "logits/chosen": -0.8634222149848938, + "logits/rejected": -0.7933879494667053, + "logps/chosen": -41.86826705932617, + "logps/rejected": -135.14114379882812, + "loss": 0.0248, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08744090050458908, + "rewards/margins": 8.024269104003906, + "rewards/rejected": -8.111709594726562, + "step": 844 + }, + { + "epoch": 1.28, + "learning_rate": 3.4221464661622977e-07, + "logits/chosen": -1.0899560451507568, + "logits/rejected": -1.0018041133880615, + "logps/chosen": -67.12272644042969, + "logps/rejected": -200.95716857910156, + "loss": 0.0339, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1569205522537231, + "rewards/margins": 11.434322357177734, + "rewards/rejected": -12.591243743896484, + "step": 845 + }, + { + "epoch": 1.29, + "learning_rate": 3.409563083603793e-07, + "logits/chosen": -1.3234093189239502, + "logits/rejected": -1.3561903238296509, + "logps/chosen": -81.11517333984375, + "logps/rejected": -161.14039611816406, + "loss": 0.0175, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.16130530834198, + "rewards/margins": 7.4840898513793945, + "rewards/rejected": -8.645394325256348, + "step": 846 + }, + { + "epoch": 1.29, + "learning_rate": 3.396990898325166e-07, + "logits/chosen": -1.1908986568450928, + "logits/rejected": -1.0249378681182861, + "logps/chosen": -75.77396392822266, + "logps/rejected": -279.65606689453125, + "loss": 0.0167, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8964182734489441, + "rewards/margins": 15.995891571044922, + "rewards/rejected": -16.892311096191406, + "step": 847 + }, + { + "epoch": 1.29, + "learning_rate": 3.384429998839375e-07, + "logits/chosen": -1.1977099180221558, + "logits/rejected": -1.2463692426681519, + "logps/chosen": -65.84039306640625, + "logps/rejected": -143.89614868164062, + "loss": 0.0359, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6831881999969482, + "rewards/margins": 8.220685958862305, + "rewards/rejected": -8.903874397277832, + "step": 848 + }, + { + "epoch": 1.29, + "learning_rate": 3.3718804735799225e-07, + "logits/chosen": -1.318107008934021, + "logits/rejected": -1.2315919399261475, + "logps/chosen": -71.2174072265625, + "logps/rejected": -191.1319122314453, + "loss": 0.0429, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7054893970489502, + "rewards/margins": 10.151110649108887, + "rewards/rejected": -10.856599807739258, + "step": 849 + }, + { + "epoch": 1.29, + "learning_rate": 3.3593424109002335e-07, + "logits/chosen": -1.2710603475570679, + "logits/rejected": -1.2152063846588135, + "logps/chosen": -61.5471076965332, + "logps/rejected": -159.23065185546875, + "loss": 0.0321, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.08005739003419876, + "rewards/margins": 8.800928115844727, + "rewards/rejected": -8.880986213684082, + "step": 850 + }, + { + "epoch": 1.29, + "learning_rate": 3.34681589907303e-07, + "logits/chosen": -1.0689845085144043, + "logits/rejected": -1.0591791868209839, + "logps/chosen": -65.96229553222656, + "logps/rejected": -159.93722534179688, + "loss": 0.0128, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3127668499946594, + "rewards/margins": 8.901169776916504, + "rewards/rejected": -9.213937759399414, + "step": 851 + }, + { + "epoch": 1.29, + "learning_rate": 3.334301026289712e-07, + "logits/chosen": -1.0889052152633667, + "logits/rejected": -1.119452714920044, + "logps/chosen": -85.632568359375, + "logps/rejected": -188.4246063232422, + "loss": 0.0295, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2408623695373535, + "rewards/margins": 9.83039665222168, + "rewards/rejected": -11.071259498596191, + "step": 852 + }, + { + "epoch": 1.3, + "learning_rate": 3.321797880659737e-07, + "logits/chosen": -1.0406701564788818, + "logits/rejected": -1.022117257118225, + "logps/chosen": -59.495052337646484, + "logps/rejected": -135.61944580078125, + "loss": 0.051, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4191983640193939, + "rewards/margins": 7.46809196472168, + "rewards/rejected": -7.887290954589844, + "step": 853 + }, + { + "epoch": 1.3, + "learning_rate": 3.309306550209999e-07, + "logits/chosen": -1.2750275135040283, + "logits/rejected": -1.2143597602844238, + "logps/chosen": -62.65305709838867, + "logps/rejected": -146.83177185058594, + "loss": 0.0551, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5164964199066162, + "rewards/margins": 7.442717552185059, + "rewards/rejected": -7.959214210510254, + "step": 854 + }, + { + "epoch": 1.3, + "learning_rate": 3.296827122884207e-07, + "logits/chosen": -1.1609838008880615, + "logits/rejected": -1.0337468385696411, + "logps/chosen": -98.77885437011719, + "logps/rejected": -226.49143981933594, + "loss": 0.0409, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.982264518737793, + "rewards/margins": 11.136384963989258, + "rewards/rejected": -13.118648529052734, + "step": 855 + }, + { + "epoch": 1.3, + "learning_rate": 3.2843596865422684e-07, + "logits/chosen": -1.2014923095703125, + "logits/rejected": -1.1845405101776123, + "logps/chosen": -52.282691955566406, + "logps/rejected": -130.2438201904297, + "loss": 0.0311, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.42395687103271484, + "rewards/margins": 7.3297038078308105, + "rewards/rejected": -7.753661155700684, + "step": 856 + }, + { + "epoch": 1.3, + "learning_rate": 3.271904328959672e-07, + "logits/chosen": -1.0705763101577759, + "logits/rejected": -1.0866658687591553, + "logps/chosen": -82.63775634765625, + "logps/rejected": -174.16571044921875, + "loss": 0.0353, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7364760637283325, + "rewards/margins": 8.609589576721191, + "rewards/rejected": -10.346065521240234, + "step": 857 + }, + { + "epoch": 1.3, + "learning_rate": 3.2594611378268614e-07, + "logits/chosen": -1.2007231712341309, + "logits/rejected": -1.2071821689605713, + "logps/chosen": -67.26966094970703, + "logps/rejected": -188.78158569335938, + "loss": 0.0068, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.009529948234558, + "rewards/margins": 10.658390045166016, + "rewards/rejected": -11.667919158935547, + "step": 858 + }, + { + "epoch": 1.3, + "learning_rate": 3.2470302007486303e-07, + "logits/chosen": -1.2172272205352783, + "logits/rejected": -1.1473592519760132, + "logps/chosen": -58.447723388671875, + "logps/rejected": -184.71939086914062, + "loss": 0.0256, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5823120474815369, + "rewards/margins": 10.512475967407227, + "rewards/rejected": -11.094788551330566, + "step": 859 + }, + { + "epoch": 1.31, + "learning_rate": 3.234611605243496e-07, + "logits/chosen": -1.0577294826507568, + "logits/rejected": -1.0574690103530884, + "logps/chosen": -66.60233306884766, + "logps/rejected": -186.69564819335938, + "loss": 0.0199, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5992780327796936, + "rewards/margins": 10.619293212890625, + "rewards/rejected": -11.218571662902832, + "step": 860 + }, + { + "epoch": 1.31, + "learning_rate": 3.222205438743089e-07, + "logits/chosen": -0.9650871753692627, + "logits/rejected": -0.9335703253746033, + "logps/chosen": -94.79300689697266, + "logps/rejected": -228.11412048339844, + "loss": 0.0079, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3178277015686035, + "rewards/margins": 12.129016876220703, + "rewards/rejected": -14.446845054626465, + "step": 861 + }, + { + "epoch": 1.31, + "learning_rate": 3.2098117885915276e-07, + "logits/chosen": -1.1251049041748047, + "logits/rejected": -1.097298264503479, + "logps/chosen": -62.1351318359375, + "logps/rejected": -166.1202392578125, + "loss": 0.0316, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5515062212944031, + "rewards/margins": 9.523969650268555, + "rewards/rejected": -10.075475692749023, + "step": 862 + }, + { + "epoch": 1.31, + "learning_rate": 3.1974307420448197e-07, + "logits/chosen": -1.0989717245101929, + "logits/rejected": -1.1117409467697144, + "logps/chosen": -63.93228530883789, + "logps/rejected": -137.04348754882812, + "loss": 0.0139, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0571870803833008, + "rewards/margins": 7.001034736633301, + "rewards/rejected": -8.058222770690918, + "step": 863 + }, + { + "epoch": 1.31, + "learning_rate": 3.185062386270234e-07, + "logits/chosen": -1.2424949407577515, + "logits/rejected": -1.1999640464782715, + "logps/chosen": -70.60078430175781, + "logps/rejected": -175.35520935058594, + "loss": 0.0393, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8002070188522339, + "rewards/margins": 9.057512283325195, + "rewards/rejected": -9.857719421386719, + "step": 864 + }, + { + "epoch": 1.31, + "learning_rate": 3.172706808345692e-07, + "logits/chosen": -1.1185640096664429, + "logits/rejected": -1.1533507108688354, + "logps/chosen": -77.15667724609375, + "logps/rejected": -178.48419189453125, + "loss": 0.0548, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9563284516334534, + "rewards/margins": 10.11512565612793, + "rewards/rejected": -11.071453094482422, + "step": 865 + }, + { + "epoch": 1.32, + "learning_rate": 3.1603640952591536e-07, + "logits/chosen": -1.3847885131835938, + "logits/rejected": -1.2616667747497559, + "logps/chosen": -63.544517517089844, + "logps/rejected": -188.02212524414062, + "loss": 0.0401, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1738569736480713, + "rewards/margins": 10.61096477508545, + "rewards/rejected": -11.784821510314941, + "step": 866 + }, + { + "epoch": 1.32, + "learning_rate": 3.1480343339080094e-07, + "logits/chosen": -1.1350643634796143, + "logits/rejected": -1.117767333984375, + "logps/chosen": -51.981014251708984, + "logps/rejected": -133.83802795410156, + "loss": 0.0264, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2095465511083603, + "rewards/margins": 7.825038909912109, + "rewards/rejected": -8.034584999084473, + "step": 867 + }, + { + "epoch": 1.32, + "learning_rate": 3.135717611098457e-07, + "logits/chosen": -1.0594005584716797, + "logits/rejected": -1.1397340297698975, + "logps/chosen": -70.15467834472656, + "logps/rejected": -154.9226531982422, + "loss": 0.0224, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.378109872341156, + "rewards/margins": 9.015453338623047, + "rewards/rejected": -9.393564224243164, + "step": 868 + }, + { + "epoch": 1.32, + "learning_rate": 3.123414013544905e-07, + "logits/chosen": -1.168516755104065, + "logits/rejected": -1.1186997890472412, + "logps/chosen": -72.56488800048828, + "logps/rejected": -190.87008666992188, + "loss": 0.0105, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.64239501953125, + "rewards/margins": 10.973026275634766, + "rewards/rejected": -11.615421295166016, + "step": 869 + }, + { + "epoch": 1.32, + "learning_rate": 3.1111236278693525e-07, + "logits/chosen": -1.1072605848312378, + "logits/rejected": -1.0726549625396729, + "logps/chosen": -76.71082305908203, + "logps/rejected": -182.40530395507812, + "loss": 0.0208, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.745031476020813, + "rewards/margins": 9.732573509216309, + "rewards/rejected": -11.477605819702148, + "step": 870 + }, + { + "epoch": 1.32, + "learning_rate": 3.0988465406007837e-07, + "logits/chosen": -1.0790177583694458, + "logits/rejected": -0.9793330430984497, + "logps/chosen": -68.66360473632812, + "logps/rejected": -213.48367309570312, + "loss": 0.0141, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07770824432373047, + "rewards/margins": 12.449033737182617, + "rewards/rejected": -12.526741027832031, + "step": 871 + }, + { + "epoch": 1.32, + "learning_rate": 3.086582838174551e-07, + "logits/chosen": -1.2810640335083008, + "logits/rejected": -1.2028204202651978, + "logps/chosen": -67.54934692382812, + "logps/rejected": -183.29315185546875, + "loss": 0.0189, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3115649223327637, + "rewards/margins": 10.0007963180542, + "rewards/rejected": -11.312360763549805, + "step": 872 + }, + { + "epoch": 1.33, + "learning_rate": 3.07433260693178e-07, + "logits/chosen": -1.0981544256210327, + "logits/rejected": -1.101778268814087, + "logps/chosen": -69.46454620361328, + "logps/rejected": -196.09434509277344, + "loss": 0.0201, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0734939575195312, + "rewards/margins": 11.262301445007324, + "rewards/rejected": -12.335796356201172, + "step": 873 + }, + { + "epoch": 1.33, + "learning_rate": 3.062095933118752e-07, + "logits/chosen": -1.1423189640045166, + "logits/rejected": -1.0814754962921143, + "logps/chosen": -68.90902709960938, + "logps/rejected": -190.8826904296875, + "loss": 0.0593, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6413151025772095, + "rewards/margins": 11.242748260498047, + "rewards/rejected": -11.884064674377441, + "step": 874 + }, + { + "epoch": 1.33, + "learning_rate": 3.0498729028862933e-07, + "logits/chosen": -1.263510823249817, + "logits/rejected": -1.2098865509033203, + "logps/chosen": -98.67642974853516, + "logps/rejected": -226.28778076171875, + "loss": 0.0203, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.4813520908355713, + "rewards/margins": 11.984416007995605, + "rewards/rejected": -15.46576976776123, + "step": 875 + }, + { + "epoch": 1.33, + "learning_rate": 3.037663602289181e-07, + "logits/chosen": -1.1308614015579224, + "logits/rejected": -1.0499013662338257, + "logps/chosen": -64.38939666748047, + "logps/rejected": -198.14913940429688, + "loss": 0.0504, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9843230843544006, + "rewards/margins": 10.785235404968262, + "rewards/rejected": -11.76955795288086, + "step": 876 + }, + { + "epoch": 1.33, + "learning_rate": 3.025468117285529e-07, + "logits/chosen": -1.167764663696289, + "logits/rejected": -1.0539222955703735, + "logps/chosen": -68.39847564697266, + "logps/rejected": -165.08724975585938, + "loss": 0.0197, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7871071100234985, + "rewards/margins": 7.700448036193848, + "rewards/rejected": -9.487555503845215, + "step": 877 + }, + { + "epoch": 1.33, + "learning_rate": 3.013286533736183e-07, + "logits/chosen": -1.1149120330810547, + "logits/rejected": -0.9907094836235046, + "logps/chosen": -81.09086608886719, + "logps/rejected": -223.73294067382812, + "loss": 0.0289, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.606088161468506, + "rewards/margins": 11.589728355407715, + "rewards/rejected": -14.195816993713379, + "step": 878 + }, + { + "epoch": 1.34, + "learning_rate": 3.0011189374041145e-07, + "logits/chosen": -0.905259370803833, + "logits/rejected": -0.8425391316413879, + "logps/chosen": -62.18824768066406, + "logps/rejected": -181.83108520507812, + "loss": 0.0422, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1463936567306519, + "rewards/margins": 10.523120880126953, + "rewards/rejected": -11.669514656066895, + "step": 879 + }, + { + "epoch": 1.34, + "learning_rate": 2.9889654139538244e-07, + "logits/chosen": -1.012556552886963, + "logits/rejected": -1.1096618175506592, + "logps/chosen": -42.681976318359375, + "logps/rejected": -131.0863037109375, + "loss": 0.0166, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7648807764053345, + "rewards/margins": 9.021519660949707, + "rewards/rejected": -8.25663948059082, + "step": 880 + }, + { + "epoch": 1.34, + "learning_rate": 2.9768260489507335e-07, + "logits/chosen": -1.2592908143997192, + "logits/rejected": -1.1268333196640015, + "logps/chosen": -73.65986633300781, + "logps/rejected": -220.07272338867188, + "loss": 0.0638, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2277494668960571, + "rewards/margins": 11.876537322998047, + "rewards/rejected": -13.104288101196289, + "step": 881 + }, + { + "epoch": 1.34, + "learning_rate": 2.9647009278605803e-07, + "logits/chosen": -1.167679786682129, + "logits/rejected": -1.0784720182418823, + "logps/chosen": -67.72370910644531, + "logps/rejected": -196.77841186523438, + "loss": 0.0188, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8135742545127869, + "rewards/margins": 11.023904800415039, + "rewards/rejected": -11.837478637695312, + "step": 882 + }, + { + "epoch": 1.34, + "learning_rate": 2.9525901360488235e-07, + "logits/chosen": -0.9997549057006836, + "logits/rejected": -0.9432014226913452, + "logps/chosen": -74.05624389648438, + "logps/rejected": -158.91029357910156, + "loss": 0.0314, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.751084804534912, + "rewards/margins": 8.212854385375977, + "rewards/rejected": -9.963939666748047, + "step": 883 + }, + { + "epoch": 1.34, + "learning_rate": 2.940493758780037e-07, + "logits/chosen": -1.043877363204956, + "logits/rejected": -0.949622631072998, + "logps/chosen": -71.61334228515625, + "logps/rejected": -199.0709686279297, + "loss": 0.0052, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8465366363525391, + "rewards/margins": 11.850892066955566, + "rewards/rejected": -12.697427749633789, + "step": 884 + }, + { + "epoch": 1.34, + "learning_rate": 2.9284118812173085e-07, + "logits/chosen": -1.1592051982879639, + "logits/rejected": -1.1231443881988525, + "logps/chosen": -66.36738586425781, + "logps/rejected": -169.8032989501953, + "loss": 0.0208, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8645541667938232, + "rewards/margins": 9.401374816894531, + "rewards/rejected": -10.265928268432617, + "step": 885 + }, + { + "epoch": 1.35, + "learning_rate": 2.916344588421645e-07, + "logits/chosen": -1.2972654104232788, + "logits/rejected": -1.255252718925476, + "logps/chosen": -93.02581787109375, + "logps/rejected": -201.748046875, + "loss": 0.023, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.3491549491882324, + "rewards/margins": 9.73039436340332, + "rewards/rejected": -12.079549789428711, + "step": 886 + }, + { + "epoch": 1.35, + "learning_rate": 2.904291965351369e-07, + "logits/chosen": -1.4914642572402954, + "logits/rejected": -1.396189570426941, + "logps/chosen": -53.96504211425781, + "logps/rejected": -151.813232421875, + "loss": 0.0586, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09026004374027252, + "rewards/margins": 8.757599830627441, + "rewards/rejected": -8.847860336303711, + "step": 887 + }, + { + "epoch": 1.35, + "learning_rate": 2.8922540968615283e-07, + "logits/chosen": -1.1544334888458252, + "logits/rejected": -1.1415927410125732, + "logps/chosen": -76.50936889648438, + "logps/rejected": -199.18942260742188, + "loss": 0.0466, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2693109512329102, + "rewards/margins": 11.33557415008545, + "rewards/rejected": -12.60488510131836, + "step": 888 + }, + { + "epoch": 1.35, + "learning_rate": 2.880231067703285e-07, + "logits/chosen": -1.0120527744293213, + "logits/rejected": -0.9444929361343384, + "logps/chosen": -56.10468292236328, + "logps/rejected": -187.27781677246094, + "loss": 0.0103, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14829185605049133, + "rewards/margins": 11.569108009338379, + "rewards/rejected": -11.717399597167969, + "step": 889 + }, + { + "epoch": 1.35, + "learning_rate": 2.8682229625233296e-07, + "logits/chosen": -1.0918954610824585, + "logits/rejected": -1.128036618232727, + "logps/chosen": -60.16938018798828, + "logps/rejected": -146.3104248046875, + "loss": 0.0249, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7777913808822632, + "rewards/margins": 8.487791061401367, + "rewards/rejected": -9.265583038330078, + "step": 890 + }, + { + "epoch": 1.35, + "learning_rate": 2.856229865863288e-07, + "logits/chosen": -1.184349536895752, + "logits/rejected": -1.1841895580291748, + "logps/chosen": -66.1183853149414, + "logps/rejected": -151.86642456054688, + "loss": 0.0271, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2216904163360596, + "rewards/margins": 8.370936393737793, + "rewards/rejected": -9.592626571655273, + "step": 891 + }, + { + "epoch": 1.36, + "learning_rate": 2.8442518621591084e-07, + "logits/chosen": -1.2903201580047607, + "logits/rejected": -1.2698955535888672, + "logps/chosen": -64.32032012939453, + "logps/rejected": -158.66183471679688, + "loss": 0.0325, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.156549096107483, + "rewards/margins": 8.878497123718262, + "rewards/rejected": -10.035046577453613, + "step": 892 + }, + { + "epoch": 1.36, + "learning_rate": 2.8322890357404907e-07, + "logits/chosen": -1.0020402669906616, + "logits/rejected": -0.8951206803321838, + "logps/chosen": -68.79203033447266, + "logps/rejected": -186.751953125, + "loss": 0.0406, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.653872549533844, + "rewards/margins": 10.672158241271973, + "rewards/rejected": -11.326029777526855, + "step": 893 + }, + { + "epoch": 1.36, + "learning_rate": 2.820341470830273e-07, + "logits/chosen": -1.3208019733428955, + "logits/rejected": -1.3319730758666992, + "logps/chosen": -58.48689651489258, + "logps/rejected": -175.47265625, + "loss": 0.0121, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8185827136039734, + "rewards/margins": 10.243091583251953, + "rewards/rejected": -11.061674118041992, + "step": 894 + }, + { + "epoch": 1.36, + "learning_rate": 2.808409251543852e-07, + "logits/chosen": -1.2520232200622559, + "logits/rejected": -1.1039667129516602, + "logps/chosen": -88.31317138671875, + "logps/rejected": -239.4126739501953, + "loss": 0.01, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2212610244750977, + "rewards/margins": 12.960351943969727, + "rewards/rejected": -15.18161392211914, + "step": 895 + }, + { + "epoch": 1.36, + "learning_rate": 2.7964924618885776e-07, + "logits/chosen": -1.0469504594802856, + "logits/rejected": -1.0014100074768066, + "logps/chosen": -54.08551025390625, + "logps/rejected": -160.48800659179688, + "loss": 0.0387, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3419005870819092, + "rewards/margins": 9.732646942138672, + "rewards/rejected": -10.07454776763916, + "step": 896 + }, + { + "epoch": 1.36, + "learning_rate": 2.784591185763182e-07, + "logits/chosen": -1.2565423250198364, + "logits/rejected": -1.2446802854537964, + "logps/chosen": -61.27532958984375, + "logps/rejected": -143.79840087890625, + "loss": 0.0445, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.73479825258255, + "rewards/margins": 7.8365888595581055, + "rewards/rejected": -8.571386337280273, + "step": 897 + }, + { + "epoch": 1.36, + "learning_rate": 2.772705506957164e-07, + "logits/chosen": -1.2247684001922607, + "logits/rejected": -1.2073692083358765, + "logps/chosen": -89.75728607177734, + "logps/rejected": -168.08990478515625, + "loss": 0.0496, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.443594217300415, + "rewards/margins": 7.36766242980957, + "rewards/rejected": -9.811256408691406, + "step": 898 + }, + { + "epoch": 1.37, + "learning_rate": 2.760835509150218e-07, + "logits/chosen": -1.235434651374817, + "logits/rejected": -1.2565488815307617, + "logps/chosen": -97.0019760131836, + "logps/rejected": -203.24266052246094, + "loss": 0.0204, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.119580030441284, + "rewards/margins": 10.19863510131836, + "rewards/rejected": -12.318215370178223, + "step": 899 + }, + { + "epoch": 1.37, + "learning_rate": 2.748981275911633e-07, + "logits/chosen": -1.2976423501968384, + "logits/rejected": -1.2595832347869873, + "logps/chosen": -89.51211547851562, + "logps/rejected": -189.83468627929688, + "loss": 0.0141, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3441162109375, + "rewards/margins": 9.426911354064941, + "rewards/rejected": -11.771027565002441, + "step": 900 + }, + { + "epoch": 1.37, + "learning_rate": 2.737142890699717e-07, + "logits/chosen": -1.294965386390686, + "logits/rejected": -1.2420074939727783, + "logps/chosen": -71.51050567626953, + "logps/rejected": -161.4373016357422, + "loss": 0.0088, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2380168437957764, + "rewards/margins": 8.606657028198242, + "rewards/rejected": -9.844674110412598, + "step": 901 + }, + { + "epoch": 1.37, + "learning_rate": 2.725320436861197e-07, + "logits/chosen": -0.9856861233711243, + "logits/rejected": -0.8646342158317566, + "logps/chosen": -92.23006439208984, + "logps/rejected": -200.3787841796875, + "loss": 0.0153, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.5142266750335693, + "rewards/margins": 8.967628479003906, + "rewards/rejected": -12.481855392456055, + "step": 902 + }, + { + "epoch": 1.37, + "learning_rate": 2.7135139976306344e-07, + "logits/chosen": -1.17044198513031, + "logits/rejected": -1.1947274208068848, + "logps/chosen": -93.87423706054688, + "logps/rejected": -209.61061096191406, + "loss": 0.0369, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7065603733062744, + "rewards/margins": 10.6439790725708, + "rewards/rejected": -13.350540161132812, + "step": 903 + }, + { + "epoch": 1.37, + "learning_rate": 2.701723656129851e-07, + "logits/chosen": -1.2732253074645996, + "logits/rejected": -1.2353371381759644, + "logps/chosen": -76.51490783691406, + "logps/rejected": -197.60107421875, + "loss": 0.0485, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4052677154541016, + "rewards/margins": 10.876425743103027, + "rewards/rejected": -12.281692504882812, + "step": 904 + }, + { + "epoch": 1.37, + "learning_rate": 2.6899494953673204e-07, + "logits/chosen": -1.1939207315444946, + "logits/rejected": -1.193476915359497, + "logps/chosen": -99.97769165039062, + "logps/rejected": -203.38156127929688, + "loss": 0.0101, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6601791381835938, + "rewards/margins": 10.198871612548828, + "rewards/rejected": -12.859050750732422, + "step": 905 + }, + { + "epoch": 1.38, + "learning_rate": 2.6781915982376124e-07, + "logits/chosen": -1.0214887857437134, + "logits/rejected": -0.8655680418014526, + "logps/chosen": -71.07418823242188, + "logps/rejected": -215.4347381591797, + "loss": 0.0225, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7634833455085754, + "rewards/margins": 12.360746383666992, + "rewards/rejected": -13.12423038482666, + "step": 906 + }, + { + "epoch": 1.38, + "learning_rate": 2.666450047520784e-07, + "logits/chosen": -1.0108181238174438, + "logits/rejected": -0.9619965553283691, + "logps/chosen": -65.72087097167969, + "logps/rejected": -172.912109375, + "loss": 0.0156, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2579199075698853, + "rewards/margins": 9.430094718933105, + "rewards/rejected": -10.68801498413086, + "step": 907 + }, + { + "epoch": 1.38, + "learning_rate": 2.6547249258818163e-07, + "logits/chosen": -1.0633748769760132, + "logits/rejected": -1.0480328798294067, + "logps/chosen": -62.42863464355469, + "logps/rejected": -164.85560607910156, + "loss": 0.0285, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9218290448188782, + "rewards/margins": 9.805809020996094, + "rewards/rejected": -10.727638244628906, + "step": 908 + }, + { + "epoch": 1.38, + "learning_rate": 2.6430163158700113e-07, + "logits/chosen": -1.2999529838562012, + "logits/rejected": -1.3151614665985107, + "logps/chosen": -54.91654586791992, + "logps/rejected": -136.32664489746094, + "loss": 0.0252, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7202343344688416, + "rewards/margins": 7.663908004760742, + "rewards/rejected": -8.38414192199707, + "step": 909 + }, + { + "epoch": 1.38, + "learning_rate": 2.631324299918436e-07, + "logits/chosen": -1.0010817050933838, + "logits/rejected": -0.9361670613288879, + "logps/chosen": -75.12582397460938, + "logps/rejected": -207.30267333984375, + "loss": 0.01, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3898661136627197, + "rewards/margins": 11.849888801574707, + "rewards/rejected": -13.239754676818848, + "step": 910 + }, + { + "epoch": 1.38, + "learning_rate": 2.61964896034332e-07, + "logits/chosen": -1.0734792947769165, + "logits/rejected": -1.0554006099700928, + "logps/chosen": -69.49102020263672, + "logps/rejected": -178.4953155517578, + "loss": 0.0099, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8538084030151367, + "rewards/margins": 9.01848030090332, + "rewards/rejected": -10.87228775024414, + "step": 911 + }, + { + "epoch": 1.39, + "learning_rate": 2.6079903793434887e-07, + "logits/chosen": -1.1782310009002686, + "logits/rejected": -1.2167466878890991, + "logps/chosen": -86.17131042480469, + "logps/rejected": -211.46607971191406, + "loss": 0.0062, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6431995630264282, + "rewards/margins": 11.687231063842773, + "rewards/rejected": -13.33043098449707, + "step": 912 + }, + { + "epoch": 1.39, + "learning_rate": 2.596348638999778e-07, + "logits/chosen": -1.1233220100402832, + "logits/rejected": -1.0774215459823608, + "logps/chosen": -61.160343170166016, + "logps/rejected": -161.19154357910156, + "loss": 0.0522, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0652896165847778, + "rewards/margins": 9.370325088500977, + "rewards/rejected": -10.435614585876465, + "step": 913 + }, + { + "epoch": 1.39, + "learning_rate": 2.584723821274464e-07, + "logits/chosen": -0.9774546027183533, + "logits/rejected": -0.8842563033103943, + "logps/chosen": -61.087642669677734, + "logps/rejected": -179.74330139160156, + "loss": 0.0279, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.837219774723053, + "rewards/margins": 9.888167381286621, + "rewards/rejected": -10.725385665893555, + "step": 914 + }, + { + "epoch": 1.39, + "learning_rate": 2.573116008010676e-07, + "logits/chosen": -1.2813230752944946, + "logits/rejected": -1.1978058815002441, + "logps/chosen": -68.40570068359375, + "logps/rejected": -208.50985717773438, + "loss": 0.055, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7870779633522034, + "rewards/margins": 12.170300483703613, + "rewards/rejected": -12.957378387451172, + "step": 915 + }, + { + "epoch": 1.39, + "learning_rate": 2.561525280931828e-07, + "logits/chosen": -1.0995240211486816, + "logits/rejected": -1.116148829460144, + "logps/chosen": -84.81562042236328, + "logps/rejected": -194.66766357421875, + "loss": 0.0154, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6737269163131714, + "rewards/margins": 10.005136489868164, + "rewards/rejected": -11.678862571716309, + "step": 916 + }, + { + "epoch": 1.39, + "learning_rate": 2.5499517216410395e-07, + "logits/chosen": -1.1118059158325195, + "logits/rejected": -1.1057324409484863, + "logps/chosen": -67.32406616210938, + "logps/rejected": -156.33616638183594, + "loss": 0.0089, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0997236967086792, + "rewards/margins": 8.82157039642334, + "rewards/rejected": -9.921294212341309, + "step": 917 + }, + { + "epoch": 1.39, + "learning_rate": 2.5383954116205654e-07, + "logits/chosen": -1.3008923530578613, + "logits/rejected": -1.3098472356796265, + "logps/chosen": -76.5130615234375, + "logps/rejected": -188.46707153320312, + "loss": 0.0167, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.248567819595337, + "rewards/margins": 10.24660587310791, + "rewards/rejected": -11.495174407958984, + "step": 918 + }, + { + "epoch": 1.4, + "learning_rate": 2.526856432231216e-07, + "logits/chosen": -1.0838638544082642, + "logits/rejected": -0.9704316854476929, + "logps/chosen": -68.77642059326172, + "logps/rejected": -216.4132843017578, + "loss": 0.0203, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5356925129890442, + "rewards/margins": 12.238455772399902, + "rewards/rejected": -12.774148941040039, + "step": 919 + }, + { + "epoch": 1.4, + "learning_rate": 2.5153348647117856e-07, + "logits/chosen": -1.1567587852478027, + "logits/rejected": -1.0584115982055664, + "logps/chosen": -90.18541717529297, + "logps/rejected": -229.05059814453125, + "loss": 0.0165, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.25400447845459, + "rewards/margins": 12.28674602508545, + "rewards/rejected": -14.540752410888672, + "step": 920 + }, + { + "epoch": 1.4, + "learning_rate": 2.5038307901784904e-07, + "logits/chosen": -1.1250687837600708, + "logits/rejected": -1.1044715642929077, + "logps/chosen": -44.823463439941406, + "logps/rejected": -126.68445587158203, + "loss": 0.0158, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23479335010051727, + "rewards/margins": 7.561722278594971, + "rewards/rejected": -7.796515464782715, + "step": 921 + }, + { + "epoch": 1.4, + "learning_rate": 2.492344289624378e-07, + "logits/chosen": -1.1347781419754028, + "logits/rejected": -1.0972087383270264, + "logps/chosen": -69.84147644042969, + "logps/rejected": -156.32574462890625, + "loss": 0.0192, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4627699851989746, + "rewards/margins": 8.338072776794434, + "rewards/rejected": -9.800844192504883, + "step": 922 + }, + { + "epoch": 1.4, + "learning_rate": 2.4808754439187787e-07, + "logits/chosen": -0.8981828689575195, + "logits/rejected": -0.7866100668907166, + "logps/chosen": -63.81999969482422, + "logps/rejected": -186.1649169921875, + "loss": 0.0132, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6151156425476074, + "rewards/margins": 11.367277145385742, + "rewards/rejected": -11.982391357421875, + "step": 923 + }, + { + "epoch": 1.4, + "learning_rate": 2.469424333806718e-07, + "logits/chosen": -1.3061175346374512, + "logits/rejected": -1.2760297060012817, + "logps/chosen": -64.2739486694336, + "logps/rejected": -141.9420623779297, + "loss": 0.0105, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9888159036636353, + "rewards/margins": 7.480759620666504, + "rewards/rejected": -8.469575881958008, + "step": 924 + }, + { + "epoch": 1.41, + "learning_rate": 2.457991039908366e-07, + "logits/chosen": -1.2425321340560913, + "logits/rejected": -1.1818774938583374, + "logps/chosen": -75.50288391113281, + "logps/rejected": -207.9646759033203, + "loss": 0.024, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.088285207748413, + "rewards/margins": 10.636645317077637, + "rewards/rejected": -11.724930763244629, + "step": 925 + }, + { + "epoch": 1.41, + "learning_rate": 2.446575642718445e-07, + "logits/chosen": -1.065239429473877, + "logits/rejected": -1.007594108581543, + "logps/chosen": -81.72013854980469, + "logps/rejected": -202.5800018310547, + "loss": 0.0064, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3376941680908203, + "rewards/margins": 11.226885795593262, + "rewards/rejected": -12.564579963684082, + "step": 926 + }, + { + "epoch": 1.41, + "learning_rate": 2.435178222605694e-07, + "logits/chosen": -1.0966297388076782, + "logits/rejected": -1.0844006538391113, + "logps/chosen": -81.39220428466797, + "logps/rejected": -178.13729858398438, + "loss": 0.02, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0578868389129639, + "rewards/margins": 9.506040573120117, + "rewards/rejected": -10.563926696777344, + "step": 927 + }, + { + "epoch": 1.41, + "learning_rate": 2.423798859812275e-07, + "logits/chosen": -1.332690715789795, + "logits/rejected": -1.3125532865524292, + "logps/chosen": -69.65312957763672, + "logps/rejected": -174.8885040283203, + "loss": 0.0075, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8267205953598022, + "rewards/margins": 9.282591819763184, + "rewards/rejected": -11.109312057495117, + "step": 928 + }, + { + "epoch": 1.41, + "learning_rate": 2.4124376344532244e-07, + "logits/chosen": -1.3996777534484863, + "logits/rejected": -1.4283418655395508, + "logps/chosen": -86.49020385742188, + "logps/rejected": -194.10888671875, + "loss": 0.01, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7857542037963867, + "rewards/margins": 10.515002250671387, + "rewards/rejected": -12.30075740814209, + "step": 929 + }, + { + "epoch": 1.41, + "learning_rate": 2.4010946265158815e-07, + "logits/chosen": -1.1814100742340088, + "logits/rejected": -1.0767438411712646, + "logps/chosen": -80.02114868164062, + "logps/rejected": -203.32337951660156, + "loss": 0.0198, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.471388578414917, + "rewards/margins": 11.117501258850098, + "rewards/rejected": -12.588889122009277, + "step": 930 + }, + { + "epoch": 1.41, + "learning_rate": 2.389769915859334e-07, + "logits/chosen": -1.0770986080169678, + "logits/rejected": -1.0264323949813843, + "logps/chosen": -71.16810607910156, + "logps/rejected": -199.77919006347656, + "loss": 0.026, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.39163076877594, + "rewards/margins": 11.245088577270508, + "rewards/rejected": -12.63671875, + "step": 931 + }, + { + "epoch": 1.42, + "learning_rate": 2.378463582213842e-07, + "logits/chosen": -1.1755002737045288, + "logits/rejected": -1.146294355392456, + "logps/chosen": -58.88164520263672, + "logps/rejected": -172.73194885253906, + "loss": 0.0053, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0842905044555664, + "rewards/margins": 10.46119213104248, + "rewards/rejected": -11.545482635498047, + "step": 932 + }, + { + "epoch": 1.42, + "learning_rate": 2.3671757051802882e-07, + "logits/chosen": -1.2189280986785889, + "logits/rejected": -1.1856813430786133, + "logps/chosen": -65.66744995117188, + "logps/rejected": -178.7213134765625, + "loss": 0.0187, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6168245077133179, + "rewards/margins": 9.953254699707031, + "rewards/rejected": -10.57007884979248, + "step": 933 + }, + { + "epoch": 1.42, + "learning_rate": 2.3559063642296163e-07, + "logits/chosen": -1.2218457460403442, + "logits/rejected": -1.1435271501541138, + "logps/chosen": -58.35125732421875, + "logps/rejected": -175.0428466796875, + "loss": 0.014, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.49362438917160034, + "rewards/margins": 10.824305534362793, + "rewards/rejected": -11.317931175231934, + "step": 934 + }, + { + "epoch": 1.42, + "learning_rate": 2.3446556387022644e-07, + "logits/chosen": -1.4056589603424072, + "logits/rejected": -1.2883325815200806, + "logps/chosen": -76.65266418457031, + "logps/rejected": -184.23593139648438, + "loss": 0.0118, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.369076728820801, + "rewards/margins": 9.001468658447266, + "rewards/rejected": -11.370546340942383, + "step": 935 + }, + { + "epoch": 1.42, + "learning_rate": 2.3334236078076126e-07, + "logits/chosen": -1.1952333450317383, + "logits/rejected": -1.1824768781661987, + "logps/chosen": -107.90875244140625, + "logps/rejected": -253.9272003173828, + "loss": 0.0241, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.239903211593628, + "rewards/margins": 12.69080924987793, + "rewards/rejected": -15.93071174621582, + "step": 936 + }, + { + "epoch": 1.42, + "learning_rate": 2.322210350623423e-07, + "logits/chosen": -1.2290904521942139, + "logits/rejected": -1.187170147895813, + "logps/chosen": -59.167877197265625, + "logps/rejected": -143.3047637939453, + "loss": 0.0157, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4827635288238525, + "rewards/margins": 7.0325422286987305, + "rewards/rejected": -8.515304565429688, + "step": 937 + }, + { + "epoch": 1.42, + "learning_rate": 2.3110159460952894e-07, + "logits/chosen": -1.0048002004623413, + "logits/rejected": -1.0073782205581665, + "logps/chosen": -42.752349853515625, + "logps/rejected": -119.19047546386719, + "loss": 0.0169, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.19428841769695282, + "rewards/margins": 7.295149326324463, + "rewards/rejected": -7.100860595703125, + "step": 938 + }, + { + "epoch": 1.43, + "learning_rate": 2.2998404730360632e-07, + "logits/chosen": -1.1652112007141113, + "logits/rejected": -1.1065256595611572, + "logps/chosen": -86.21635437011719, + "logps/rejected": -182.9861602783203, + "loss": 0.0318, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4450039863586426, + "rewards/margins": 8.922704696655273, + "rewards/rejected": -11.367709159851074, + "step": 939 + }, + { + "epoch": 1.43, + "learning_rate": 2.2886840101253247e-07, + "logits/chosen": -1.4147456884384155, + "logits/rejected": -1.3730257749557495, + "logps/chosen": -61.425506591796875, + "logps/rejected": -178.63778686523438, + "loss": 0.0085, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.104467749595642, + "rewards/margins": 10.780162811279297, + "rewards/rejected": -11.884631156921387, + "step": 940 + }, + { + "epoch": 1.43, + "learning_rate": 2.277546635908808e-07, + "logits/chosen": -1.211792230606079, + "logits/rejected": -1.1947301626205444, + "logps/chosen": -63.770347595214844, + "logps/rejected": -158.47808837890625, + "loss": 0.0093, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.442031741142273, + "rewards/margins": 8.224104881286621, + "rewards/rejected": -9.666136741638184, + "step": 941 + }, + { + "epoch": 1.43, + "learning_rate": 2.2664284287978568e-07, + "logits/chosen": -1.1722586154937744, + "logits/rejected": -1.0732977390289307, + "logps/chosen": -73.99296569824219, + "logps/rejected": -199.77081298828125, + "loss": 0.0115, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9483563303947449, + "rewards/margins": 11.3140287399292, + "rewards/rejected": -12.262385368347168, + "step": 942 + }, + { + "epoch": 1.43, + "learning_rate": 2.2553294670688693e-07, + "logits/chosen": -1.2367777824401855, + "logits/rejected": -1.2036511898040771, + "logps/chosen": -68.31490325927734, + "logps/rejected": -156.6162872314453, + "loss": 0.0246, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.096004843711853, + "rewards/margins": 8.337125778198242, + "rewards/rejected": -9.433130264282227, + "step": 943 + }, + { + "epoch": 1.43, + "learning_rate": 2.2442498288627555e-07, + "logits/chosen": -1.0678187608718872, + "logits/rejected": -0.9708797931671143, + "logps/chosen": -58.706153869628906, + "logps/rejected": -194.00180053710938, + "loss": 0.0155, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7698225975036621, + "rewards/margins": 11.374448776245117, + "rewards/rejected": -12.144271850585938, + "step": 944 + }, + { + "epoch": 1.44, + "learning_rate": 2.2331895921843736e-07, + "logits/chosen": -0.8866348266601562, + "logits/rejected": -0.8945915102958679, + "logps/chosen": -71.54261016845703, + "logps/rejected": -213.23483276367188, + "loss": 0.0078, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1069694757461548, + "rewards/margins": 13.323320388793945, + "rewards/rejected": -14.430290222167969, + "step": 945 + }, + { + "epoch": 1.44, + "learning_rate": 2.2221488349019902e-07, + "logits/chosen": -0.9755781292915344, + "logits/rejected": -0.8473294377326965, + "logps/chosen": -62.1094856262207, + "logps/rejected": -177.36032104492188, + "loss": 0.0115, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2787407636642456, + "rewards/margins": 10.158745765686035, + "rewards/rejected": -11.43748664855957, + "step": 946 + }, + { + "epoch": 1.44, + "learning_rate": 2.2111276347467273e-07, + "logits/chosen": -1.450989007949829, + "logits/rejected": -1.4887750148773193, + "logps/chosen": -49.60207748413086, + "logps/rejected": -119.30854797363281, + "loss": 0.0136, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.638967752456665, + "rewards/margins": 6.7576751708984375, + "rewards/rejected": -7.396642684936523, + "step": 947 + }, + { + "epoch": 1.44, + "learning_rate": 2.2001260693120232e-07, + "logits/chosen": -1.2514420747756958, + "logits/rejected": -1.2486284971237183, + "logps/chosen": -73.5145263671875, + "logps/rejected": -168.6020965576172, + "loss": 0.0066, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3822462558746338, + "rewards/margins": 9.278196334838867, + "rewards/rejected": -10.660442352294922, + "step": 948 + }, + { + "epoch": 1.44, + "learning_rate": 2.189144216053075e-07, + "logits/chosen": -1.361900806427002, + "logits/rejected": -1.2939060926437378, + "logps/chosen": -66.65200805664062, + "logps/rejected": -193.84371948242188, + "loss": 0.0311, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5823947787284851, + "rewards/margins": 11.211771011352539, + "rewards/rejected": -11.794166564941406, + "step": 949 + }, + { + "epoch": 1.44, + "learning_rate": 2.1781821522862982e-07, + "logits/chosen": -1.2740370035171509, + "logits/rejected": -1.25923490524292, + "logps/chosen": -83.86727905273438, + "logps/rejected": -196.86184692382812, + "loss": 0.0133, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2171200513839722, + "rewards/margins": 10.744917869567871, + "rewards/rejected": -11.9620361328125, + "step": 950 + }, + { + "epoch": 1.44, + "learning_rate": 2.1672399551887882e-07, + "logits/chosen": -1.039217472076416, + "logits/rejected": -0.9383636116981506, + "logps/chosen": -87.74999237060547, + "logps/rejected": -198.6109161376953, + "loss": 0.0103, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7518144845962524, + "rewards/margins": 10.704536437988281, + "rewards/rejected": -12.456352233886719, + "step": 951 + }, + { + "epoch": 1.45, + "learning_rate": 2.1563177017977657e-07, + "logits/chosen": -1.0178163051605225, + "logits/rejected": -1.0056424140930176, + "logps/chosen": -62.35600280761719, + "logps/rejected": -155.6488494873047, + "loss": 0.0157, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5035408735275269, + "rewards/margins": 8.838569641113281, + "rewards/rejected": -9.342110633850098, + "step": 952 + }, + { + "epoch": 1.45, + "learning_rate": 2.1454154690100434e-07, + "logits/chosen": -0.967587947845459, + "logits/rejected": -0.9174187779426575, + "logps/chosen": -77.53561401367188, + "logps/rejected": -187.43785095214844, + "loss": 0.0177, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.443828821182251, + "rewards/margins": 10.672246932983398, + "rewards/rejected": -12.116077423095703, + "step": 953 + }, + { + "epoch": 1.45, + "learning_rate": 2.134533333581478e-07, + "logits/chosen": -1.0567262172698975, + "logits/rejected": -0.9337789416313171, + "logps/chosen": -63.441650390625, + "logps/rejected": -181.72671508789062, + "loss": 0.0176, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6463605165481567, + "rewards/margins": 10.389936447143555, + "rewards/rejected": -11.036297798156738, + "step": 954 + }, + { + "epoch": 1.45, + "learning_rate": 2.1236713721264416e-07, + "logits/chosen": -1.3626728057861328, + "logits/rejected": -1.2640372514724731, + "logps/chosen": -73.54210662841797, + "logps/rejected": -187.09765625, + "loss": 0.0228, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5929640531539917, + "rewards/margins": 9.273469924926758, + "rewards/rejected": -10.866434097290039, + "step": 955 + }, + { + "epoch": 1.45, + "learning_rate": 2.112829661117259e-07, + "logits/chosen": -1.2080678939819336, + "logits/rejected": -1.213911533355713, + "logps/chosen": -61.92439270019531, + "logps/rejected": -166.87149047851562, + "loss": 0.0113, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23983648419380188, + "rewards/margins": 10.470162391662598, + "rewards/rejected": -10.709999084472656, + "step": 956 + }, + { + "epoch": 1.45, + "learning_rate": 2.1020082768837e-07, + "logits/chosen": -1.2203989028930664, + "logits/rejected": -0.993392825126648, + "logps/chosen": -79.013427734375, + "logps/rejected": -252.40550231933594, + "loss": 0.0055, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2387796640396118, + "rewards/margins": 14.322129249572754, + "rewards/rejected": -15.560907363891602, + "step": 957 + }, + { + "epoch": 1.46, + "learning_rate": 2.0912072956124166e-07, + "logits/chosen": -0.9172457456588745, + "logits/rejected": -0.803143322467804, + "logps/chosen": -72.17263793945312, + "logps/rejected": -193.0008544921875, + "loss": 0.0139, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4307657480239868, + "rewards/margins": 10.762880325317383, + "rewards/rejected": -12.193644523620605, + "step": 958 + }, + { + "epoch": 1.46, + "learning_rate": 2.0804267933464192e-07, + "logits/chosen": -1.259220004081726, + "logits/rejected": -1.3154269456863403, + "logps/chosen": -68.33270263671875, + "logps/rejected": -174.4493408203125, + "loss": 0.0351, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.544312059879303, + "rewards/margins": 10.16675090789795, + "rewards/rejected": -10.711063385009766, + "step": 959 + }, + { + "epoch": 1.46, + "learning_rate": 2.0696668459845352e-07, + "logits/chosen": -1.0741063356399536, + "logits/rejected": -1.010119080543518, + "logps/chosen": -57.818946838378906, + "logps/rejected": -186.72503662109375, + "loss": 0.0076, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5061759948730469, + "rewards/margins": 11.49766731262207, + "rewards/rejected": -12.003843307495117, + "step": 960 + }, + { + "epoch": 1.46, + "learning_rate": 2.0589275292808844e-07, + "logits/chosen": -1.4062213897705078, + "logits/rejected": -1.4807547330856323, + "logps/chosen": -71.68291473388672, + "logps/rejected": -164.98974609375, + "loss": 0.0077, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5269719362258911, + "rewards/margins": 9.205770492553711, + "rewards/rejected": -9.732741355895996, + "step": 961 + }, + { + "epoch": 1.46, + "learning_rate": 2.048208918844333e-07, + "logits/chosen": -1.1317811012268066, + "logits/rejected": -1.110825538635254, + "logps/chosen": -88.1626205444336, + "logps/rejected": -243.67288208007812, + "loss": 0.011, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1779937744140625, + "rewards/margins": 13.70100212097168, + "rewards/rejected": -14.878996849060059, + "step": 962 + }, + { + "epoch": 1.46, + "learning_rate": 2.0375110901379672e-07, + "logits/chosen": -1.1075657606124878, + "logits/rejected": -1.0189461708068848, + "logps/chosen": -83.45532989501953, + "logps/rejected": -226.39210510253906, + "loss": 0.007, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.797542929649353, + "rewards/margins": 12.032994270324707, + "rewards/rejected": -13.830537796020508, + "step": 963 + }, + { + "epoch": 1.46, + "learning_rate": 2.026834118478567e-07, + "logits/chosen": -1.2279272079467773, + "logits/rejected": -1.0915871858596802, + "logps/chosen": -90.93171691894531, + "logps/rejected": -213.2340545654297, + "loss": 0.017, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.0455241203308105, + "rewards/margins": 10.05965805053711, + "rewards/rejected": -13.105180740356445, + "step": 964 + }, + { + "epoch": 1.47, + "learning_rate": 2.0161780790360656e-07, + "logits/chosen": -1.2595267295837402, + "logits/rejected": -1.1609396934509277, + "logps/chosen": -93.37518310546875, + "logps/rejected": -227.83084106445312, + "loss": 0.0252, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.054603099822998, + "rewards/margins": 12.473885536193848, + "rewards/rejected": -14.528489112854004, + "step": 965 + }, + { + "epoch": 1.47, + "learning_rate": 2.005543046833028e-07, + "logits/chosen": -1.173004388809204, + "logits/rejected": -1.1194164752960205, + "logps/chosen": -77.56388854980469, + "logps/rejected": -209.70628356933594, + "loss": 0.0085, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4635385274887085, + "rewards/margins": 11.74592399597168, + "rewards/rejected": -13.209463119506836, + "step": 966 + }, + { + "epoch": 1.47, + "learning_rate": 1.994929096744118e-07, + "logits/chosen": -1.002015471458435, + "logits/rejected": -0.8927529454231262, + "logps/chosen": -84.05574798583984, + "logps/rejected": -219.0233917236328, + "loss": 0.0256, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.806456208229065, + "rewards/margins": 11.940408706665039, + "rewards/rejected": -13.746864318847656, + "step": 967 + }, + { + "epoch": 1.47, + "learning_rate": 1.9843363034955795e-07, + "logits/chosen": -1.0354502201080322, + "logits/rejected": -0.826656699180603, + "logps/chosen": -75.4397201538086, + "logps/rejected": -226.78677368164062, + "loss": 0.0515, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.4359992742538452, + "rewards/margins": 13.272261619567871, + "rewards/rejected": -14.708261489868164, + "step": 968 + }, + { + "epoch": 1.47, + "learning_rate": 1.9737647416646935e-07, + "logits/chosen": -0.9954381585121155, + "logits/rejected": -0.926822304725647, + "logps/chosen": -69.42559814453125, + "logps/rejected": -169.87826538085938, + "loss": 0.0144, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4862395524978638, + "rewards/margins": 9.249549865722656, + "rewards/rejected": -10.735790252685547, + "step": 969 + }, + { + "epoch": 1.47, + "learning_rate": 1.9632144856792748e-07, + "logits/chosen": -1.2075226306915283, + "logits/rejected": -1.0959432125091553, + "logps/chosen": -83.94832611083984, + "logps/rejected": -228.6538543701172, + "loss": 0.012, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8144811391830444, + "rewards/margins": 12.6270751953125, + "rewards/rejected": -14.441555976867676, + "step": 970 + }, + { + "epoch": 1.48, + "learning_rate": 1.9526856098171285e-07, + "logits/chosen": -1.2695893049240112, + "logits/rejected": -1.1151591539382935, + "logps/chosen": -65.54235076904297, + "logps/rejected": -203.7481689453125, + "loss": 0.0151, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.806349754333496, + "rewards/margins": 11.04599380493164, + "rewards/rejected": -12.852344512939453, + "step": 971 + }, + { + "epoch": 1.48, + "learning_rate": 1.9421781882055443e-07, + "logits/chosen": -1.1956473588943481, + "logits/rejected": -1.1801730394363403, + "logps/chosen": -71.11809539794922, + "logps/rejected": -146.0307159423828, + "loss": 0.0205, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5277948379516602, + "rewards/margins": 7.8370041847229, + "rewards/rejected": -9.364798545837402, + "step": 972 + }, + { + "epoch": 1.48, + "learning_rate": 1.9316922948207542e-07, + "logits/chosen": -1.0702711343765259, + "logits/rejected": -1.00505793094635, + "logps/chosen": -76.5950927734375, + "logps/rejected": -201.7117462158203, + "loss": 0.0205, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2814533710479736, + "rewards/margins": 11.382299423217773, + "rewards/rejected": -12.663751602172852, + "step": 973 + }, + { + "epoch": 1.48, + "learning_rate": 1.921228003487435e-07, + "logits/chosen": -1.1192896366119385, + "logits/rejected": -1.006330966949463, + "logps/chosen": -82.68030548095703, + "logps/rejected": -230.17153930664062, + "loss": 0.055, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.709291696548462, + "rewards/margins": 12.382524490356445, + "rewards/rejected": -14.091817855834961, + "step": 974 + }, + { + "epoch": 1.48, + "learning_rate": 1.9107853878781693e-07, + "logits/chosen": -0.970392644405365, + "logits/rejected": -0.9290090203285217, + "logps/chosen": -71.13276672363281, + "logps/rejected": -173.7963409423828, + "loss": 0.0254, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4579185247421265, + "rewards/margins": 10.079607963562012, + "rewards/rejected": -11.537527084350586, + "step": 975 + }, + { + "epoch": 1.48, + "learning_rate": 1.9003645215129355e-07, + "logits/chosen": -0.8659820556640625, + "logits/rejected": -0.9184397459030151, + "logps/chosen": -51.90268325805664, + "logps/rejected": -124.06462097167969, + "loss": 0.0266, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5599133372306824, + "rewards/margins": 7.936857223510742, + "rewards/rejected": -8.496770858764648, + "step": 976 + }, + { + "epoch": 1.48, + "learning_rate": 1.8899654777585932e-07, + "logits/chosen": -1.1589514017105103, + "logits/rejected": -1.1574431657791138, + "logps/chosen": -79.15308380126953, + "logps/rejected": -195.00845336914062, + "loss": 0.0182, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4611399173736572, + "rewards/margins": 10.257494926452637, + "rewards/rejected": -12.718634605407715, + "step": 977 + }, + { + "epoch": 1.49, + "learning_rate": 1.8795883298283583e-07, + "logits/chosen": -0.9930282235145569, + "logits/rejected": -0.9695369005203247, + "logps/chosen": -70.81267547607422, + "logps/rejected": -166.0614776611328, + "loss": 0.0217, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1345221996307373, + "rewards/margins": 8.868050575256348, + "rewards/rejected": -11.002573013305664, + "step": 978 + }, + { + "epoch": 1.49, + "learning_rate": 1.8692331507812925e-07, + "logits/chosen": -1.2379798889160156, + "logits/rejected": -1.1774482727050781, + "logps/chosen": -74.19889068603516, + "logps/rejected": -196.72525024414062, + "loss": 0.0102, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3309439420700073, + "rewards/margins": 11.356307029724121, + "rewards/rejected": -12.687250137329102, + "step": 979 + }, + { + "epoch": 1.49, + "learning_rate": 1.858900013521788e-07, + "logits/chosen": -1.2760841846466064, + "logits/rejected": -1.3011666536331177, + "logps/chosen": -78.61241149902344, + "logps/rejected": -183.62718200683594, + "loss": 0.0153, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.298687696456909, + "rewards/margins": 9.355890274047852, + "rewards/rejected": -11.65457820892334, + "step": 980 + }, + { + "epoch": 1.49, + "learning_rate": 1.8485889907990576e-07, + "logits/chosen": -1.2428977489471436, + "logits/rejected": -1.201936960220337, + "logps/chosen": -68.67544555664062, + "logps/rejected": -179.7180938720703, + "loss": 0.0083, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5715904235839844, + "rewards/margins": 10.29904556274414, + "rewards/rejected": -11.870635986328125, + "step": 981 + }, + { + "epoch": 1.49, + "learning_rate": 1.8383001552066162e-07, + "logits/chosen": -0.9922041893005371, + "logits/rejected": -0.9825817346572876, + "logps/chosen": -59.647212982177734, + "logps/rejected": -136.5089569091797, + "loss": 0.0113, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3703937530517578, + "rewards/margins": 7.617739677429199, + "rewards/rejected": -8.988134384155273, + "step": 982 + }, + { + "epoch": 1.49, + "learning_rate": 1.828033579181773e-07, + "logits/chosen": -1.0681895017623901, + "logits/rejected": -0.9357763528823853, + "logps/chosen": -62.36621856689453, + "logps/rejected": -179.41033935546875, + "loss": 0.0172, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1313633918762207, + "rewards/margins": 10.333325386047363, + "rewards/rejected": -11.464689254760742, + "step": 983 + }, + { + "epoch": 1.49, + "learning_rate": 1.817789335005121e-07, + "logits/chosen": -1.045304536819458, + "logits/rejected": -1.0401690006256104, + "logps/chosen": -71.23977661132812, + "logps/rejected": -191.34725952148438, + "loss": 0.0207, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3006216287612915, + "rewards/margins": 11.322778701782227, + "rewards/rejected": -12.623400688171387, + "step": 984 + }, + { + "epoch": 1.5, + "learning_rate": 1.807567494800034e-07, + "logits/chosen": -1.2853546142578125, + "logits/rejected": -1.1250349283218384, + "logps/chosen": -85.35986328125, + "logps/rejected": -244.93942260742188, + "loss": 0.0318, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.169968605041504, + "rewards/margins": 13.561067581176758, + "rewards/rejected": -15.731036186218262, + "step": 985 + }, + { + "epoch": 1.5, + "learning_rate": 1.7973681305321426e-07, + "logits/chosen": -1.049625039100647, + "logits/rejected": -0.8729156255722046, + "logps/chosen": -73.04606628417969, + "logps/rejected": -207.68057250976562, + "loss": 0.0102, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.765155553817749, + "rewards/margins": 11.577472686767578, + "rewards/rejected": -13.342628479003906, + "step": 986 + }, + { + "epoch": 1.5, + "learning_rate": 1.7871913140088497e-07, + "logits/chosen": -1.1029876470565796, + "logits/rejected": -1.0141730308532715, + "logps/chosen": -93.17534637451172, + "logps/rejected": -228.5102081298828, + "loss": 0.0516, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.936983346939087, + "rewards/margins": 11.797006607055664, + "rewards/rejected": -14.733988761901855, + "step": 987 + }, + { + "epoch": 1.5, + "learning_rate": 1.777037116878804e-07, + "logits/chosen": -1.1183948516845703, + "logits/rejected": -1.0615265369415283, + "logps/chosen": -69.53657531738281, + "logps/rejected": -185.73117065429688, + "loss": 0.0221, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2220745086669922, + "rewards/margins": 9.990640640258789, + "rewards/rejected": -11.212716102600098, + "step": 988 + }, + { + "epoch": 1.5, + "learning_rate": 1.7669056106314162e-07, + "logits/chosen": -1.1812067031860352, + "logits/rejected": -1.1323652267456055, + "logps/chosen": -89.53622436523438, + "logps/rejected": -201.0779571533203, + "loss": 0.0069, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0934267044067383, + "rewards/margins": 10.49223804473877, + "rewards/rejected": -12.585663795471191, + "step": 989 + }, + { + "epoch": 1.5, + "learning_rate": 1.7567968665963296e-07, + "logits/chosen": -1.2452868223190308, + "logits/rejected": -1.234442114830017, + "logps/chosen": -75.95500183105469, + "logps/rejected": -168.5236053466797, + "loss": 0.0284, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4998321533203125, + "rewards/margins": 8.998800277709961, + "rewards/rejected": -10.498632431030273, + "step": 990 + }, + { + "epoch": 1.51, + "learning_rate": 1.7467109559429466e-07, + "logits/chosen": -0.9743750691413879, + "logits/rejected": -0.8114648461341858, + "logps/chosen": -91.20507049560547, + "logps/rejected": -236.1385498046875, + "loss": 0.0101, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.929940700531006, + "rewards/margins": 12.097036361694336, + "rewards/rejected": -15.026976585388184, + "step": 991 + }, + { + "epoch": 1.51, + "learning_rate": 1.7366479496799074e-07, + "logits/chosen": -1.07231605052948, + "logits/rejected": -1.0738083124160767, + "logps/chosen": -88.92713928222656, + "logps/rejected": -203.30935668945312, + "loss": 0.0319, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9227832555770874, + "rewards/margins": 10.71860122680664, + "rewards/rejected": -12.641385078430176, + "step": 992 + }, + { + "epoch": 1.51, + "learning_rate": 1.7266079186545956e-07, + "logits/chosen": -1.1185206174850464, + "logits/rejected": -1.049841046333313, + "logps/chosen": -77.78147888183594, + "logps/rejected": -178.53648376464844, + "loss": 0.0412, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.051157236099243, + "rewards/margins": 9.125386238098145, + "rewards/rejected": -11.176543235778809, + "step": 993 + }, + { + "epoch": 1.51, + "learning_rate": 1.7165909335526453e-07, + "logits/chosen": -0.9872387051582336, + "logits/rejected": -0.817636251449585, + "logps/chosen": -98.48197937011719, + "logps/rejected": -267.2848205566406, + "loss": 0.013, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6456620693206787, + "rewards/margins": 13.687142372131348, + "rewards/rejected": -16.332805633544922, + "step": 994 + }, + { + "epoch": 1.51, + "learning_rate": 1.7065970648974343e-07, + "logits/chosen": -1.0143423080444336, + "logits/rejected": -0.9600812792778015, + "logps/chosen": -59.890235900878906, + "logps/rejected": -154.62332153320312, + "loss": 0.0101, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.262808620929718, + "rewards/margins": 8.929404258728027, + "rewards/rejected": -9.19221305847168, + "step": 995 + }, + { + "epoch": 1.51, + "learning_rate": 1.6966263830495935e-07, + "logits/chosen": -1.0823190212249756, + "logits/rejected": -1.0538568496704102, + "logps/chosen": -62.140281677246094, + "logps/rejected": -173.5354461669922, + "loss": 0.0152, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23087503015995026, + "rewards/margins": 10.62917709350586, + "rewards/rejected": -10.860052108764648, + "step": 996 + }, + { + "epoch": 1.51, + "learning_rate": 1.6866789582065078e-07, + "logits/chosen": -1.244469404220581, + "logits/rejected": -1.1900691986083984, + "logps/chosen": -72.99565124511719, + "logps/rejected": -180.4878387451172, + "loss": 0.0341, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.373180627822876, + "rewards/margins": 9.928434371948242, + "rewards/rejected": -11.301615715026855, + "step": 997 + }, + { + "epoch": 1.52, + "learning_rate": 1.6767548604018289e-07, + "logits/chosen": -1.1198558807373047, + "logits/rejected": -1.0448769330978394, + "logps/chosen": -90.15465545654297, + "logps/rejected": -233.24017333984375, + "loss": 0.0103, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.387511968612671, + "rewards/margins": 12.50436019897461, + "rewards/rejected": -14.89187240600586, + "step": 998 + }, + { + "epoch": 1.52, + "learning_rate": 1.6668541595049724e-07, + "logits/chosen": -1.2688748836517334, + "logits/rejected": -1.2117234468460083, + "logps/chosen": -74.76568603515625, + "logps/rejected": -207.88917541503906, + "loss": 0.0113, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0294783115386963, + "rewards/margins": 12.35433578491211, + "rewards/rejected": -13.383813858032227, + "step": 999 + }, + { + "epoch": 1.52, + "learning_rate": 1.6569769252206328e-07, + "logits/chosen": -1.2145278453826904, + "logits/rejected": -1.1683098077774048, + "logps/chosen": -89.501953125, + "logps/rejected": -212.30300903320312, + "loss": 0.0166, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.642409324645996, + "rewards/margins": 11.852131843566895, + "rewards/rejected": -13.49454116821289, + "step": 1000 + }, + { + "epoch": 1.52, + "learning_rate": 1.6471232270882883e-07, + "logits/chosen": -1.2460477352142334, + "logits/rejected": -1.2110188007354736, + "logps/chosen": -73.52095031738281, + "logps/rejected": -198.0554656982422, + "loss": 0.0145, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7613444328308105, + "rewards/margins": 11.186271667480469, + "rewards/rejected": -12.947614669799805, + "step": 1001 + }, + { + "epoch": 1.52, + "learning_rate": 1.6372931344817214e-07, + "logits/chosen": -1.0738894939422607, + "logits/rejected": -0.957828938961029, + "logps/chosen": -100.32572174072266, + "logps/rejected": -272.6159362792969, + "loss": 0.0162, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.615443706512451, + "rewards/margins": 14.619828224182129, + "rewards/rejected": -17.235271453857422, + "step": 1002 + }, + { + "epoch": 1.52, + "learning_rate": 1.6274867166085105e-07, + "logits/chosen": -1.0579982995986938, + "logits/rejected": -0.90909343957901, + "logps/chosen": -82.44677734375, + "logps/rejected": -228.25204467773438, + "loss": 0.0324, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4460911750793457, + "rewards/margins": 13.131321907043457, + "rewards/rejected": -14.577413558959961, + "step": 1003 + }, + { + "epoch": 1.53, + "learning_rate": 1.6177040425095663e-07, + "logits/chosen": -0.9436768889427185, + "logits/rejected": -0.8684688806533813, + "logps/chosen": -64.3193130493164, + "logps/rejected": -185.3251953125, + "loss": 0.0178, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9520430564880371, + "rewards/margins": 11.001495361328125, + "rewards/rejected": -11.95353889465332, + "step": 1004 + }, + { + "epoch": 1.53, + "learning_rate": 1.6079451810586276e-07, + "logits/chosen": -1.2696284055709839, + "logits/rejected": -1.1688989400863647, + "logps/chosen": -83.22187805175781, + "logps/rejected": -239.7696533203125, + "loss": 0.0077, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.726318597793579, + "rewards/margins": 13.411822319030762, + "rewards/rejected": -15.138140678405762, + "step": 1005 + }, + { + "epoch": 1.53, + "learning_rate": 1.5982102009617832e-07, + "logits/chosen": -1.1807959079742432, + "logits/rejected": -1.1397391557693481, + "logps/chosen": -65.00617980957031, + "logps/rejected": -173.939697265625, + "loss": 0.0237, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8103467226028442, + "rewards/margins": 10.269601821899414, + "rewards/rejected": -11.079949378967285, + "step": 1006 + }, + { + "epoch": 1.53, + "learning_rate": 1.5884991707569945e-07, + "logits/chosen": -1.0643589496612549, + "logits/rejected": -1.0183255672454834, + "logps/chosen": -80.94669342041016, + "logps/rejected": -239.7604522705078, + "loss": 0.0506, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8289381265640259, + "rewards/margins": 13.630366325378418, + "rewards/rejected": -15.45930290222168, + "step": 1007 + }, + { + "epoch": 1.53, + "learning_rate": 1.5788121588135972e-07, + "logits/chosen": -1.4639328718185425, + "logits/rejected": -1.3777185678482056, + "logps/chosen": -74.71759796142578, + "logps/rejected": -170.49945068359375, + "loss": 0.0187, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5641496181488037, + "rewards/margins": 8.775116920471191, + "rewards/rejected": -10.339265823364258, + "step": 1008 + }, + { + "epoch": 1.53, + "learning_rate": 1.5691492333318402e-07, + "logits/chosen": -1.0215314626693726, + "logits/rejected": -0.8911815285682678, + "logps/chosen": -59.70145034790039, + "logps/rejected": -179.14625549316406, + "loss": 0.0166, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.588193416595459, + "rewards/margins": 10.755075454711914, + "rewards/rejected": -11.343268394470215, + "step": 1009 + }, + { + "epoch": 1.53, + "learning_rate": 1.559510462342381e-07, + "logits/chosen": -1.0152214765548706, + "logits/rejected": -0.9721912741661072, + "logps/chosen": -64.24443817138672, + "logps/rejected": -162.21929931640625, + "loss": 0.0244, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1861426830291748, + "rewards/margins": 9.191162109375, + "rewards/rejected": -10.377304077148438, + "step": 1010 + }, + { + "epoch": 1.54, + "learning_rate": 1.5498959137058339e-07, + "logits/chosen": -1.160400152206421, + "logits/rejected": -1.0179263353347778, + "logps/chosen": -77.59323120117188, + "logps/rejected": -207.6278533935547, + "loss": 0.0062, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.159334659576416, + "rewards/margins": 12.358423233032227, + "rewards/rejected": -13.5177583694458, + "step": 1011 + }, + { + "epoch": 1.54, + "learning_rate": 1.5403056551122694e-07, + "logits/chosen": -0.99965500831604, + "logits/rejected": -0.8801881074905396, + "logps/chosen": -63.581539154052734, + "logps/rejected": -166.3328857421875, + "loss": 0.0106, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1613078117370605, + "rewards/margins": 9.462004661560059, + "rewards/rejected": -10.623311996459961, + "step": 1012 + }, + { + "epoch": 1.54, + "learning_rate": 1.530739754080751e-07, + "logits/chosen": -1.1453322172164917, + "logits/rejected": -1.1141060590744019, + "logps/chosen": -69.9078369140625, + "logps/rejected": -170.01576232910156, + "loss": 0.0064, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9004390239715576, + "rewards/margins": 9.831196784973145, + "rewards/rejected": -10.731636047363281, + "step": 1013 + }, + { + "epoch": 1.54, + "learning_rate": 1.5211982779588534e-07, + "logits/chosen": -1.238998293876648, + "logits/rejected": -1.168033242225647, + "logps/chosen": -44.39051055908203, + "logps/rejected": -132.7353515625, + "loss": 0.0284, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0709717869758606, + "rewards/margins": 8.28502368927002, + "rewards/rejected": -8.214052200317383, + "step": 1014 + }, + { + "epoch": 1.54, + "learning_rate": 1.5116812939221962e-07, + "logits/chosen": -1.2873257398605347, + "logits/rejected": -1.162929892539978, + "logps/chosen": -84.09979248046875, + "logps/rejected": -211.19015502929688, + "loss": 0.01, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2492868900299072, + "rewards/margins": 10.819801330566406, + "rewards/rejected": -13.069087028503418, + "step": 1015 + }, + { + "epoch": 1.54, + "learning_rate": 1.5021888689739547e-07, + "logits/chosen": -0.9515085816383362, + "logits/rejected": -0.8473897576332092, + "logps/chosen": -65.8515853881836, + "logps/rejected": -222.41104125976562, + "loss": 0.0073, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4497348368167877, + "rewards/margins": 13.304035186767578, + "rewards/rejected": -13.753767013549805, + "step": 1016 + }, + { + "epoch": 1.55, + "learning_rate": 1.4927210699444103e-07, + "logits/chosen": -1.0177326202392578, + "logits/rejected": -0.946979820728302, + "logps/chosen": -101.52448272705078, + "logps/rejected": -212.36209106445312, + "loss": 0.0078, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.146263360977173, + "rewards/margins": 10.454294204711914, + "rewards/rejected": -13.600556373596191, + "step": 1017 + }, + { + "epoch": 1.55, + "learning_rate": 1.4832779634904608e-07, + "logits/chosen": -1.0603152513504028, + "logits/rejected": -0.9408301711082458, + "logps/chosen": -73.39222717285156, + "logps/rejected": -208.1505126953125, + "loss": 0.0063, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2289427518844604, + "rewards/margins": 12.148762702941895, + "rewards/rejected": -13.377705574035645, + "step": 1018 + }, + { + "epoch": 1.55, + "learning_rate": 1.4738596160951645e-07, + "logits/chosen": -1.0486927032470703, + "logits/rejected": -0.9102394580841064, + "logps/chosen": -61.55266571044922, + "logps/rejected": -188.68402099609375, + "loss": 0.0115, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9976965188980103, + "rewards/margins": 11.183516502380371, + "rewards/rejected": -12.18121337890625, + "step": 1019 + }, + { + "epoch": 1.55, + "learning_rate": 1.4644660940672627e-07, + "logits/chosen": -1.242224931716919, + "logits/rejected": -1.1961392164230347, + "logps/chosen": -61.681758880615234, + "logps/rejected": -154.76602172851562, + "loss": 0.0424, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1015557050704956, + "rewards/margins": 8.659936904907227, + "rewards/rejected": -9.761491775512695, + "step": 1020 + }, + { + "epoch": 1.55, + "learning_rate": 1.455097463540717e-07, + "logits/chosen": -1.018207311630249, + "logits/rejected": -0.7073631286621094, + "logps/chosen": -84.21482849121094, + "logps/rejected": -262.6379699707031, + "loss": 0.0028, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0382328033447266, + "rewards/margins": 14.893991470336914, + "rewards/rejected": -16.932226181030273, + "step": 1021 + }, + { + "epoch": 1.55, + "learning_rate": 1.445753790474245e-07, + "logits/chosen": -1.041068196296692, + "logits/rejected": -0.9955125451087952, + "logps/chosen": -67.77099609375, + "logps/rejected": -174.4007110595703, + "loss": 0.0059, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6224466562271118, + "rewards/margins": 9.725898742675781, + "rewards/rejected": -11.348344802856445, + "step": 1022 + }, + { + "epoch": 1.55, + "learning_rate": 1.436435140650852e-07, + "logits/chosen": -1.3501616716384888, + "logits/rejected": -1.4464925527572632, + "logps/chosen": -88.64131927490234, + "logps/rejected": -190.63385009765625, + "loss": 0.0155, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.100098133087158, + "rewards/margins": 9.95816421508789, + "rewards/rejected": -12.058262825012207, + "step": 1023 + }, + { + "epoch": 1.56, + "learning_rate": 1.427141579677374e-07, + "logits/chosen": -0.9608927369117737, + "logits/rejected": -0.9767427444458008, + "logps/chosen": -54.372188568115234, + "logps/rejected": -143.82839965820312, + "loss": 0.0106, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.326077938079834, + "rewards/margins": 8.830038070678711, + "rewards/rejected": -9.156116485595703, + "step": 1024 + }, + { + "epoch": 1.56, + "learning_rate": 1.417873172984006e-07, + "logits/chosen": -0.7643793225288391, + "logits/rejected": -0.7025930881500244, + "logps/chosen": -54.08414077758789, + "logps/rejected": -185.32861328125, + "loss": 0.0027, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.43912678956985474, + "rewards/margins": 12.152729988098145, + "rewards/rejected": -12.591856956481934, + "step": 1025 + }, + { + "epoch": 1.56, + "learning_rate": 1.408629985823857e-07, + "logits/chosen": -1.1674954891204834, + "logits/rejected": -1.1374739408493042, + "logps/chosen": -76.21036529541016, + "logps/rejected": -189.28546142578125, + "loss": 0.016, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2749236822128296, + "rewards/margins": 11.543883323669434, + "rewards/rejected": -12.818806648254395, + "step": 1026 + }, + { + "epoch": 1.56, + "learning_rate": 1.3994120832724677e-07, + "logits/chosen": -1.1271106004714966, + "logits/rejected": -1.1465332508087158, + "logps/chosen": -67.489990234375, + "logps/rejected": -147.8130645751953, + "loss": 0.0352, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7015074491500854, + "rewards/margins": 7.949741840362549, + "rewards/rejected": -9.651248931884766, + "step": 1027 + }, + { + "epoch": 1.56, + "learning_rate": 1.3902195302273778e-07, + "logits/chosen": -1.2397160530090332, + "logits/rejected": -1.264036774635315, + "logps/chosen": -48.882606506347656, + "logps/rejected": -148.37850952148438, + "loss": 0.0176, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.11703987419605255, + "rewards/margins": 9.144170761108398, + "rewards/rejected": -9.027130126953125, + "step": 1028 + }, + { + "epoch": 1.56, + "learning_rate": 1.38105239140765e-07, + "logits/chosen": -1.2493691444396973, + "logits/rejected": -1.210587739944458, + "logps/chosen": -59.75130844116211, + "logps/rejected": -181.18606567382812, + "loss": 0.0197, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3405154347419739, + "rewards/margins": 11.08544921875, + "rewards/rejected": -11.425966262817383, + "step": 1029 + }, + { + "epoch": 1.56, + "learning_rate": 1.3719107313534223e-07, + "logits/chosen": -0.9990692734718323, + "logits/rejected": -0.9542756080627441, + "logps/chosen": -70.06559753417969, + "logps/rejected": -171.16131591796875, + "loss": 0.0176, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5102357864379883, + "rewards/margins": 8.921667098999023, + "rewards/rejected": -10.431903839111328, + "step": 1030 + }, + { + "epoch": 1.57, + "learning_rate": 1.362794614425452e-07, + "logits/chosen": -1.1448475122451782, + "logits/rejected": -1.093543529510498, + "logps/chosen": -79.64584350585938, + "logps/rejected": -173.28114318847656, + "loss": 0.0118, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9164886474609375, + "rewards/margins": 8.729981422424316, + "rewards/rejected": -10.646470069885254, + "step": 1031 + }, + { + "epoch": 1.57, + "learning_rate": 1.3537041048046692e-07, + "logits/chosen": -1.044460415840149, + "logits/rejected": -1.036932349205017, + "logps/chosen": -70.70366668701172, + "logps/rejected": -182.9157257080078, + "loss": 0.0102, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5457505583763123, + "rewards/margins": 11.093820571899414, + "rewards/rejected": -11.639572143554688, + "step": 1032 + }, + { + "epoch": 1.57, + "learning_rate": 1.344639266491708e-07, + "logits/chosen": -1.442973017692566, + "logits/rejected": -1.3494670391082764, + "logps/chosen": -73.3709487915039, + "logps/rejected": -219.76773071289062, + "loss": 0.0307, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2042829990386963, + "rewards/margins": 13.003188133239746, + "rewards/rejected": -14.20747184753418, + "step": 1033 + }, + { + "epoch": 1.57, + "learning_rate": 1.3356001633064761e-07, + "logits/chosen": -1.2116889953613281, + "logits/rejected": -1.123871088027954, + "logps/chosen": -72.24166107177734, + "logps/rejected": -189.9979248046875, + "loss": 0.0113, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5618987083435059, + "rewards/margins": 10.573122024536133, + "rewards/rejected": -12.135021209716797, + "step": 1034 + }, + { + "epoch": 1.57, + "learning_rate": 1.32658685888769e-07, + "logits/chosen": -1.1446404457092285, + "logits/rejected": -1.1779420375823975, + "logps/chosen": -79.43793487548828, + "logps/rejected": -170.514404296875, + "loss": 0.0253, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1790692806243896, + "rewards/margins": 9.217103958129883, + "rewards/rejected": -11.396173477172852, + "step": 1035 + }, + { + "epoch": 1.57, + "learning_rate": 1.3175994166924392e-07, + "logits/chosen": -1.1650694608688354, + "logits/rejected": -1.0598572492599487, + "logps/chosen": -65.18163299560547, + "logps/rejected": -196.41819763183594, + "loss": 0.0057, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2281684875488281, + "rewards/margins": 11.982068061828613, + "rewards/rejected": -13.210236549377441, + "step": 1036 + }, + { + "epoch": 1.58, + "learning_rate": 1.3086378999957276e-07, + "logits/chosen": -1.1600673198699951, + "logits/rejected": -1.049504041671753, + "logps/chosen": -97.90890502929688, + "logps/rejected": -248.05784606933594, + "loss": 0.0263, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4427404403686523, + "rewards/margins": 13.142251968383789, + "rewards/rejected": -15.584993362426758, + "step": 1037 + }, + { + "epoch": 1.58, + "learning_rate": 1.2997023718900352e-07, + "logits/chosen": -0.8932716846466064, + "logits/rejected": -0.770018458366394, + "logps/chosen": -82.77156066894531, + "logps/rejected": -231.54843139648438, + "loss": 0.0215, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1219401359558105, + "rewards/margins": 12.232475280761719, + "rewards/rejected": -14.354415893554688, + "step": 1038 + }, + { + "epoch": 1.58, + "learning_rate": 1.2907928952848773e-07, + "logits/chosen": -1.12833833694458, + "logits/rejected": -1.050106406211853, + "logps/chosen": -55.28139114379883, + "logps/rejected": -156.7615966796875, + "loss": 0.0439, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4876255393028259, + "rewards/margins": 9.004829406738281, + "rewards/rejected": -9.492454528808594, + "step": 1039 + }, + { + "epoch": 1.58, + "learning_rate": 1.2819095329063466e-07, + "logits/chosen": -0.8341787457466125, + "logits/rejected": -0.7984386682510376, + "logps/chosen": -90.12042236328125, + "logps/rejected": -218.397216796875, + "loss": 0.0111, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.062849283218384, + "rewards/margins": 11.312891960144043, + "rewards/rejected": -13.375741004943848, + "step": 1040 + }, + { + "epoch": 1.58, + "learning_rate": 1.2730523472966924e-07, + "logits/chosen": -1.3362005949020386, + "logits/rejected": -1.1511693000793457, + "logps/chosen": -72.99873352050781, + "logps/rejected": -211.25875854492188, + "loss": 0.0263, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2961386442184448, + "rewards/margins": 11.985052108764648, + "rewards/rejected": -13.281190872192383, + "step": 1041 + }, + { + "epoch": 1.58, + "learning_rate": 1.2642214008138642e-07, + "logits/chosen": -1.3697905540466309, + "logits/rejected": -1.3827335834503174, + "logps/chosen": -80.32099151611328, + "logps/rejected": -196.6895294189453, + "loss": 0.0222, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.268894910812378, + "rewards/margins": 10.322794914245605, + "rewards/rejected": -12.591690063476562, + "step": 1042 + }, + { + "epoch": 1.58, + "learning_rate": 1.255416755631078e-07, + "logits/chosen": -1.0065165758132935, + "logits/rejected": -0.987860918045044, + "logps/chosen": -58.82838439941406, + "logps/rejected": -180.65725708007812, + "loss": 0.0108, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2405383586883545, + "rewards/margins": 9.941941261291504, + "rewards/rejected": -11.182479858398438, + "step": 1043 + }, + { + "epoch": 1.59, + "learning_rate": 1.2466384737363779e-07, + "logits/chosen": -1.1970065832138062, + "logits/rejected": -1.130141258239746, + "logps/chosen": -90.82254028320312, + "logps/rejected": -208.41156005859375, + "loss": 0.0211, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5388426780700684, + "rewards/margins": 11.361875534057617, + "rewards/rejected": -12.900716781616211, + "step": 1044 + }, + { + "epoch": 1.59, + "learning_rate": 1.2378866169322062e-07, + "logits/chosen": -0.7908114790916443, + "logits/rejected": -0.6253232955932617, + "logps/chosen": -73.36567687988281, + "logps/rejected": -178.9694061279297, + "loss": 0.0172, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.912410855293274, + "rewards/margins": 9.460705757141113, + "rewards/rejected": -11.373116493225098, + "step": 1045 + }, + { + "epoch": 1.59, + "learning_rate": 1.2291612468349554e-07, + "logits/chosen": -1.3395980596542358, + "logits/rejected": -1.2968026399612427, + "logps/chosen": -49.36256408691406, + "logps/rejected": -129.70005798339844, + "loss": 0.0125, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3536704182624817, + "rewards/margins": 7.407713413238525, + "rewards/rejected": -7.7613844871521, + "step": 1046 + }, + { + "epoch": 1.59, + "learning_rate": 1.220462424874546e-07, + "logits/chosen": -1.2717788219451904, + "logits/rejected": -1.1471407413482666, + "logps/chosen": -82.47772216796875, + "logps/rejected": -215.64578247070312, + "loss": 0.0116, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8614273071289062, + "rewards/margins": 11.546565055847168, + "rewards/rejected": -13.407992362976074, + "step": 1047 + }, + { + "epoch": 1.59, + "learning_rate": 1.211790212293986e-07, + "logits/chosen": -1.333188772201538, + "logits/rejected": -1.2694209814071655, + "logps/chosen": -83.06002807617188, + "logps/rejected": -161.90350341796875, + "loss": 0.0261, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4736416339874268, + "rewards/margins": 8.215384483337402, + "rewards/rejected": -9.68902587890625, + "step": 1048 + }, + { + "epoch": 1.59, + "learning_rate": 1.2031446701489478e-07, + "logits/chosen": -1.0990818738937378, + "logits/rejected": -0.9182289838790894, + "logps/chosen": -101.83406829833984, + "logps/rejected": -278.59649658203125, + "loss": 0.0199, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.2921340465545654, + "rewards/margins": 13.88831901550293, + "rewards/rejected": -17.180452346801758, + "step": 1049 + }, + { + "epoch": 1.6, + "learning_rate": 1.194525859307331e-07, + "logits/chosen": -1.4561948776245117, + "logits/rejected": -1.4749445915222168, + "logps/chosen": -70.3445053100586, + "logps/rejected": -168.951904296875, + "loss": 0.0155, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.176849126815796, + "rewards/margins": 9.659462928771973, + "rewards/rejected": -10.836312294006348, + "step": 1050 + }, + { + "epoch": 1.6, + "learning_rate": 1.1859338404488339e-07, + "logits/chosen": -1.0425996780395508, + "logits/rejected": -0.8686420321464539, + "logps/chosen": -83.05058288574219, + "logps/rejected": -254.65374755859375, + "loss": 0.0098, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5835621356964111, + "rewards/margins": 15.090636253356934, + "rewards/rejected": -16.6742000579834, + "step": 1051 + }, + { + "epoch": 1.6, + "learning_rate": 1.1773686740645383e-07, + "logits/chosen": -1.0283887386322021, + "logits/rejected": -0.8973429799079895, + "logps/chosen": -69.80270385742188, + "logps/rejected": -225.36180114746094, + "loss": 0.0138, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8019357323646545, + "rewards/margins": 12.729826927185059, + "rewards/rejected": -13.531761169433594, + "step": 1052 + }, + { + "epoch": 1.6, + "learning_rate": 1.1688304204564614e-07, + "logits/chosen": -1.1241620779037476, + "logits/rejected": -1.2368371486663818, + "logps/chosen": -58.863121032714844, + "logps/rejected": -160.85916137695312, + "loss": 0.0172, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7072696089744568, + "rewards/margins": 9.642600059509277, + "rewards/rejected": -10.349868774414062, + "step": 1053 + }, + { + "epoch": 1.6, + "learning_rate": 1.1603191397371558e-07, + "logits/chosen": -1.241661548614502, + "logits/rejected": -1.1888474225997925, + "logps/chosen": -77.02825927734375, + "logps/rejected": -177.23056030273438, + "loss": 0.0058, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.189224123954773, + "rewards/margins": 9.070510864257812, + "rewards/rejected": -10.259737014770508, + "step": 1054 + }, + { + "epoch": 1.6, + "learning_rate": 1.1518348918292675e-07, + "logits/chosen": -1.247529149055481, + "logits/rejected": -1.127182960510254, + "logps/chosen": -75.47073364257812, + "logps/rejected": -201.03326416015625, + "loss": 0.0213, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.533700942993164, + "rewards/margins": 10.706246376037598, + "rewards/rejected": -12.239948272705078, + "step": 1055 + }, + { + "epoch": 1.6, + "learning_rate": 1.143377736465127e-07, + "logits/chosen": -1.1370017528533936, + "logits/rejected": -1.0267651081085205, + "logps/chosen": -104.05082702636719, + "logps/rejected": -227.39601135253906, + "loss": 0.0255, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.871164321899414, + "rewards/margins": 11.378036499023438, + "rewards/rejected": -15.249199867248535, + "step": 1056 + }, + { + "epoch": 1.61, + "learning_rate": 1.134947733186315e-07, + "logits/chosen": -1.0723297595977783, + "logits/rejected": -0.9450559616088867, + "logps/chosen": -79.4171142578125, + "logps/rejected": -208.52902221679688, + "loss": 0.0158, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.940129280090332, + "rewards/margins": 10.709497451782227, + "rewards/rejected": -12.649626731872559, + "step": 1057 + }, + { + "epoch": 1.61, + "learning_rate": 1.1265449413432598e-07, + "logits/chosen": -1.2580574750900269, + "logits/rejected": -1.2264395952224731, + "logps/chosen": -71.53569030761719, + "logps/rejected": -161.5540771484375, + "loss": 0.0187, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8972296714782715, + "rewards/margins": 8.918797492980957, + "rewards/rejected": -10.81602668762207, + "step": 1058 + }, + { + "epoch": 1.61, + "learning_rate": 1.118169420094806e-07, + "logits/chosen": -1.21419095993042, + "logits/rejected": -1.1768602132797241, + "logps/chosen": -63.22785568237305, + "logps/rejected": -188.97410583496094, + "loss": 0.0173, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7150611877441406, + "rewards/margins": 11.163619995117188, + "rewards/rejected": -11.878681182861328, + "step": 1059 + }, + { + "epoch": 1.61, + "learning_rate": 1.1098212284078035e-07, + "logits/chosen": -0.8862547278404236, + "logits/rejected": -0.7847481369972229, + "logps/chosen": -57.296485900878906, + "logps/rejected": -165.74615478515625, + "loss": 0.0127, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.118424654006958, + "rewards/margins": 9.22490406036377, + "rewards/rejected": -10.343327522277832, + "step": 1060 + }, + { + "epoch": 1.61, + "learning_rate": 1.1015004250566906e-07, + "logits/chosen": -1.0470751523971558, + "logits/rejected": -0.8667370676994324, + "logps/chosen": -92.5534896850586, + "logps/rejected": -254.15220642089844, + "loss": 0.0129, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8795552253723145, + "rewards/margins": 12.72509479522705, + "rewards/rejected": -15.60464859008789, + "step": 1061 + }, + { + "epoch": 1.61, + "learning_rate": 1.093207068623086e-07, + "logits/chosen": -1.079001784324646, + "logits/rejected": -0.9033374786376953, + "logps/chosen": -91.79341125488281, + "logps/rejected": -251.64300537109375, + "loss": 0.019, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8336005210876465, + "rewards/margins": 13.242752075195312, + "rewards/rejected": -16.076351165771484, + "step": 1062 + }, + { + "epoch": 1.61, + "learning_rate": 1.0849412174953671e-07, + "logits/chosen": -0.9910807013511658, + "logits/rejected": -0.8828264474868774, + "logps/chosen": -101.47459411621094, + "logps/rejected": -259.4543762207031, + "loss": 0.0363, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.174511671066284, + "rewards/margins": 12.094611167907715, + "rewards/rejected": -15.269123077392578, + "step": 1063 + }, + { + "epoch": 1.62, + "learning_rate": 1.0767029298682639e-07, + "logits/chosen": -1.1676360368728638, + "logits/rejected": -1.095585584640503, + "logps/chosen": -64.70406341552734, + "logps/rejected": -168.17710876464844, + "loss": 0.0156, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0001543760299683, + "rewards/margins": 9.639240264892578, + "rewards/rejected": -10.639395713806152, + "step": 1064 + }, + { + "epoch": 1.62, + "learning_rate": 1.0684922637424504e-07, + "logits/chosen": -1.244273066520691, + "logits/rejected": -1.1721899509429932, + "logps/chosen": -85.20843505859375, + "logps/rejected": -225.7948455810547, + "loss": 0.0287, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6695289611816406, + "rewards/margins": 11.740074157714844, + "rewards/rejected": -14.409602165222168, + "step": 1065 + }, + { + "epoch": 1.62, + "learning_rate": 1.060309276924135e-07, + "logits/chosen": -1.1296035051345825, + "logits/rejected": -1.0875864028930664, + "logps/chosen": -66.95491027832031, + "logps/rejected": -172.0052490234375, + "loss": 0.0366, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7586946487426758, + "rewards/margins": 9.80600357055664, + "rewards/rejected": -10.564699172973633, + "step": 1066 + }, + { + "epoch": 1.62, + "learning_rate": 1.0521540270246526e-07, + "logits/chosen": -1.3834147453308105, + "logits/rejected": -1.2829577922821045, + "logps/chosen": -77.07857513427734, + "logps/rejected": -214.35385131835938, + "loss": 0.0488, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8441226482391357, + "rewards/margins": 11.873616218566895, + "rewards/rejected": -12.717740058898926, + "step": 1067 + }, + { + "epoch": 1.62, + "learning_rate": 1.0440265714600571e-07, + "logits/chosen": -1.1304280757904053, + "logits/rejected": -1.0598454475402832, + "logps/chosen": -70.51648712158203, + "logps/rejected": -190.3502197265625, + "loss": 0.0088, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9114686250686646, + "rewards/margins": 10.219488143920898, + "rewards/rejected": -11.13095760345459, + "step": 1068 + }, + { + "epoch": 1.62, + "learning_rate": 1.0359269674507271e-07, + "logits/chosen": -1.2193621397018433, + "logits/rejected": -1.2017300128936768, + "logps/chosen": -96.5212173461914, + "logps/rejected": -248.94580078125, + "loss": 0.0136, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.753895878791809, + "rewards/margins": 13.867959022521973, + "rewards/rejected": -15.621854782104492, + "step": 1069 + }, + { + "epoch": 1.63, + "learning_rate": 1.0278552720209449e-07, + "logits/chosen": -1.0258492231369019, + "logits/rejected": -0.9258626699447632, + "logps/chosen": -64.7344970703125, + "logps/rejected": -187.8410186767578, + "loss": 0.0422, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0400198698043823, + "rewards/margins": 11.164020538330078, + "rewards/rejected": -12.204039573669434, + "step": 1070 + }, + { + "epoch": 1.63, + "learning_rate": 1.0198115419985154e-07, + "logits/chosen": -1.2533814907073975, + "logits/rejected": -1.221890926361084, + "logps/chosen": -59.12885284423828, + "logps/rejected": -154.86053466796875, + "loss": 0.0081, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6402745246887207, + "rewards/margins": 8.871824264526367, + "rewards/rejected": -9.512099266052246, + "step": 1071 + }, + { + "epoch": 1.63, + "learning_rate": 1.0117958340143506e-07, + "logits/chosen": -1.2478764057159424, + "logits/rejected": -1.072830080986023, + "logps/chosen": -79.64214324951172, + "logps/rejected": -229.7861785888672, + "loss": 0.0049, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.148681879043579, + "rewards/margins": 12.207650184631348, + "rewards/rejected": -14.356331825256348, + "step": 1072 + }, + { + "epoch": 1.63, + "learning_rate": 1.0038082045020824e-07, + "logits/chosen": -1.1577354669570923, + "logits/rejected": -1.1522295475006104, + "logps/chosen": -69.2307357788086, + "logps/rejected": -181.38600158691406, + "loss": 0.0496, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3045772314071655, + "rewards/margins": 10.574820518493652, + "rewards/rejected": -11.879398345947266, + "step": 1073 + }, + { + "epoch": 1.63, + "learning_rate": 9.958487096976504e-08, + "logits/chosen": -1.1740790605545044, + "logits/rejected": -1.0017123222351074, + "logps/chosen": -49.28252029418945, + "logps/rejected": -193.6754913330078, + "loss": 0.0073, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09034234285354614, + "rewards/margins": 12.283891677856445, + "rewards/rejected": -12.374235153198242, + "step": 1074 + }, + { + "epoch": 1.63, + "learning_rate": 9.87917405638925e-08, + "logits/chosen": -1.1445330381393433, + "logits/rejected": -1.0902642011642456, + "logps/chosen": -71.83496856689453, + "logps/rejected": -209.97894287109375, + "loss": 0.013, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7673781514167786, + "rewards/margins": 12.23061466217041, + "rewards/rejected": -12.997993469238281, + "step": 1075 + }, + { + "epoch": 1.63, + "learning_rate": 9.800143481652979e-08, + "logits/chosen": -1.0803571939468384, + "logits/rejected": -1.0840669870376587, + "logps/chosen": -81.00335693359375, + "logps/rejected": -198.65615844726562, + "loss": 0.0111, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.196942687034607, + "rewards/margins": 10.90712833404541, + "rewards/rejected": -12.104071617126465, + "step": 1076 + }, + { + "epoch": 1.64, + "learning_rate": 9.721395929172943e-08, + "logits/chosen": -0.961615800857544, + "logits/rejected": -0.988190770149231, + "logps/chosen": -82.76693725585938, + "logps/rejected": -191.41744995117188, + "loss": 0.0079, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2863515615463257, + "rewards/margins": 11.099027633666992, + "rewards/rejected": -12.38537883758545, + "step": 1077 + }, + { + "epoch": 1.64, + "learning_rate": 9.642931953361805e-08, + "logits/chosen": -1.0419639348983765, + "logits/rejected": -1.0288581848144531, + "logps/chosen": -85.21007537841797, + "logps/rejected": -221.56063842773438, + "loss": 0.0075, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.589748740196228, + "rewards/margins": 12.904745101928711, + "rewards/rejected": -14.49449348449707, + "step": 1078 + }, + { + "epoch": 1.64, + "learning_rate": 9.564752106635781e-08, + "logits/chosen": -0.9632241129875183, + "logits/rejected": -0.9397358894348145, + "logps/chosen": -65.1313705444336, + "logps/rejected": -187.80287170410156, + "loss": 0.0085, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9772980213165283, + "rewards/margins": 10.745587348937988, + "rewards/rejected": -11.722885131835938, + "step": 1079 + }, + { + "epoch": 1.64, + "learning_rate": 9.48685693941067e-08, + "logits/chosen": -1.1339447498321533, + "logits/rejected": -1.0799775123596191, + "logps/chosen": -83.80926513671875, + "logps/rejected": -219.28671264648438, + "loss": 0.0051, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5794214010238647, + "rewards/margins": 12.531013488769531, + "rewards/rejected": -14.110435485839844, + "step": 1080 + }, + { + "epoch": 1.64, + "learning_rate": 9.409247000098009e-08, + "logits/chosen": -1.3046482801437378, + "logits/rejected": -1.250767469406128, + "logps/chosen": -91.93920135498047, + "logps/rejected": -236.53927612304688, + "loss": 0.0136, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.001004457473755, + "rewards/margins": 13.450885772705078, + "rewards/rejected": -15.45189094543457, + "step": 1081 + }, + { + "epoch": 1.64, + "learning_rate": 9.33192283510128e-08, + "logits/chosen": -0.9359659552574158, + "logits/rejected": -0.8699568510055542, + "logps/chosen": -102.46256256103516, + "logps/rejected": -237.10525512695312, + "loss": 0.009, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.088146448135376, + "rewards/margins": 12.04832649230957, + "rewards/rejected": -15.136472702026367, + "step": 1082 + }, + { + "epoch": 1.65, + "learning_rate": 9.254884988811951e-08, + "logits/chosen": -1.1467124223709106, + "logits/rejected": -1.0064427852630615, + "logps/chosen": -69.78543853759766, + "logps/rejected": -175.60227966308594, + "loss": 0.0208, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6399104595184326, + "rewards/margins": 10.142022132873535, + "rewards/rejected": -10.78193187713623, + "step": 1083 + }, + { + "epoch": 1.65, + "learning_rate": 9.17813400360572e-08, + "logits/chosen": -1.236464500427246, + "logits/rejected": -1.1323363780975342, + "logps/chosen": -65.01455688476562, + "logps/rejected": -166.62130737304688, + "loss": 0.0091, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.599092960357666, + "rewards/margins": 8.680694580078125, + "rewards/rejected": -10.279787063598633, + "step": 1084 + }, + { + "epoch": 1.65, + "learning_rate": 9.101670419838652e-08, + "logits/chosen": -1.1149910688400269, + "logits/rejected": -0.9636414647102356, + "logps/chosen": -90.10865783691406, + "logps/rejected": -251.3350067138672, + "loss": 0.019, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6027493476867676, + "rewards/margins": 13.645480155944824, + "rewards/rejected": -16.24822998046875, + "step": 1085 + }, + { + "epoch": 1.65, + "learning_rate": 9.025494775843456e-08, + "logits/chosen": -1.1713343858718872, + "logits/rejected": -1.116579294204712, + "logps/chosen": -73.0295181274414, + "logps/rejected": -194.81912231445312, + "loss": 0.0137, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4419115781784058, + "rewards/margins": 11.159817695617676, + "rewards/rejected": -12.601728439331055, + "step": 1086 + }, + { + "epoch": 1.65, + "learning_rate": 8.949607607925541e-08, + "logits/chosen": -0.9148304462432861, + "logits/rejected": -0.8945529460906982, + "logps/chosen": -68.20335388183594, + "logps/rejected": -201.86953735351562, + "loss": 0.0085, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7701525688171387, + "rewards/margins": 12.294547080993652, + "rewards/rejected": -13.064699172973633, + "step": 1087 + }, + { + "epoch": 1.65, + "learning_rate": 8.874009450359426e-08, + "logits/chosen": -1.4068900346755981, + "logits/rejected": -1.3015477657318115, + "logps/chosen": -82.90198516845703, + "logps/rejected": -216.49276733398438, + "loss": 0.0115, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9591162204742432, + "rewards/margins": 11.598179817199707, + "rewards/rejected": -13.557294845581055, + "step": 1088 + }, + { + "epoch": 1.65, + "learning_rate": 8.798700835384842e-08, + "logits/chosen": -1.3619306087493896, + "logits/rejected": -1.400967001914978, + "logps/chosen": -77.90013885498047, + "logps/rejected": -219.6149139404297, + "loss": 0.0157, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.714892864227295, + "rewards/margins": 13.306224822998047, + "rewards/rejected": -15.0211181640625, + "step": 1089 + }, + { + "epoch": 1.66, + "learning_rate": 8.723682293203033e-08, + "logits/chosen": -1.1045377254486084, + "logits/rejected": -0.9827411770820618, + "logps/chosen": -74.56683349609375, + "logps/rejected": -211.0465087890625, + "loss": 0.0203, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3548325300216675, + "rewards/margins": 10.95444107055664, + "rewards/rejected": -12.309274673461914, + "step": 1090 + }, + { + "epoch": 1.66, + "learning_rate": 8.648954351973015e-08, + "logits/chosen": -1.2276021242141724, + "logits/rejected": -1.1442418098449707, + "logps/chosen": -69.00041961669922, + "logps/rejected": -185.90890502929688, + "loss": 0.0146, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6087831258773804, + "rewards/margins": 11.31124210357666, + "rewards/rejected": -11.920024871826172, + "step": 1091 + }, + { + "epoch": 1.66, + "learning_rate": 8.574517537807896e-08, + "logits/chosen": -1.1629654169082642, + "logits/rejected": -1.0115498304367065, + "logps/chosen": -67.43597412109375, + "logps/rejected": -219.76559448242188, + "loss": 0.0117, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2888271808624268, + "rewards/margins": 12.611084938049316, + "rewards/rejected": -13.899911880493164, + "step": 1092 + }, + { + "epoch": 1.66, + "learning_rate": 8.500372374771103e-08, + "logits/chosen": -0.9780749082565308, + "logits/rejected": -0.8621728420257568, + "logps/chosen": -70.65216064453125, + "logps/rejected": -178.79026794433594, + "loss": 0.0114, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0733259916305542, + "rewards/margins": 10.05894660949707, + "rewards/rejected": -11.132272720336914, + "step": 1093 + }, + { + "epoch": 1.66, + "learning_rate": 8.426519384872732e-08, + "logits/chosen": -1.104317307472229, + "logits/rejected": -1.0625383853912354, + "logps/chosen": -55.13422393798828, + "logps/rejected": -145.50135803222656, + "loss": 0.0096, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2227928638458252, + "rewards/margins": 8.60960865020752, + "rewards/rejected": -9.832401275634766, + "step": 1094 + }, + { + "epoch": 1.66, + "learning_rate": 8.352959088065903e-08, + "logits/chosen": -1.2082970142364502, + "logits/rejected": -1.2114728689193726, + "logps/chosen": -86.84661865234375, + "logps/rejected": -216.3503875732422, + "loss": 0.0207, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.226700782775879, + "rewards/margins": 11.329172134399414, + "rewards/rejected": -13.555872917175293, + "step": 1095 + }, + { + "epoch": 1.67, + "learning_rate": 8.279692002243028e-08, + "logits/chosen": -1.2522339820861816, + "logits/rejected": -1.1314581632614136, + "logps/chosen": -71.96464538574219, + "logps/rejected": -202.87396240234375, + "loss": 0.0173, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.396548867225647, + "rewards/margins": 11.046591758728027, + "rewards/rejected": -12.443140029907227, + "step": 1096 + }, + { + "epoch": 1.67, + "learning_rate": 8.206718643232207e-08, + "logits/chosen": -1.1508793830871582, + "logits/rejected": -1.076686143875122, + "logps/chosen": -70.49022674560547, + "logps/rejected": -211.82797241210938, + "loss": 0.0121, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.335092544555664, + "rewards/margins": 11.92380428314209, + "rewards/rejected": -13.258896827697754, + "step": 1097 + }, + { + "epoch": 1.67, + "learning_rate": 8.134039524793601e-08, + "logits/chosen": -1.2321585416793823, + "logits/rejected": -1.1257213354110718, + "logps/chosen": -91.0789566040039, + "logps/rejected": -230.8169403076172, + "loss": 0.0076, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.274380922317505, + "rewards/margins": 12.497110366821289, + "rewards/rejected": -14.771492004394531, + "step": 1098 + }, + { + "epoch": 1.67, + "learning_rate": 8.061655158615821e-08, + "logits/chosen": -1.4044198989868164, + "logits/rejected": -1.2616177797317505, + "logps/chosen": -73.60513305664062, + "logps/rejected": -224.2547607421875, + "loss": 0.0063, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9663087129592896, + "rewards/margins": 12.21403694152832, + "rewards/rejected": -14.180344581604004, + "step": 1099 + }, + { + "epoch": 1.67, + "learning_rate": 7.989566054312286e-08, + "logits/chosen": -1.2367472648620605, + "logits/rejected": -1.132240891456604, + "logps/chosen": -68.86763763427734, + "logps/rejected": -181.22474670410156, + "loss": 0.0278, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8581984043121338, + "rewards/margins": 9.85179615020752, + "rewards/rejected": -11.70999526977539, + "step": 1100 + }, + { + "epoch": 1.67, + "learning_rate": 7.91777271941766e-08, + "logits/chosen": -1.0721819400787354, + "logits/rejected": -1.093714952468872, + "logps/chosen": -72.05963134765625, + "logps/rejected": -216.3887176513672, + "loss": 0.0184, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9113300442695618, + "rewards/margins": 13.480352401733398, + "rewards/rejected": -14.391682624816895, + "step": 1101 + }, + { + "epoch": 1.67, + "learning_rate": 7.846275659384278e-08, + "logits/chosen": -1.0708647966384888, + "logits/rejected": -1.0220985412597656, + "logps/chosen": -73.78739166259766, + "logps/rejected": -198.44204711914062, + "loss": 0.0089, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0984382629394531, + "rewards/margins": 12.221569061279297, + "rewards/rejected": -13.320008277893066, + "step": 1102 + }, + { + "epoch": 1.68, + "learning_rate": 7.775075377578633e-08, + "logits/chosen": -1.2051002979278564, + "logits/rejected": -1.12346613407135, + "logps/chosen": -69.5119857788086, + "logps/rejected": -186.70733642578125, + "loss": 0.0081, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6519767045974731, + "rewards/margins": 10.11180591583252, + "rewards/rejected": -11.763782501220703, + "step": 1103 + }, + { + "epoch": 1.68, + "learning_rate": 7.70417237527769e-08, + "logits/chosen": -1.1896698474884033, + "logits/rejected": -1.0908278226852417, + "logps/chosen": -66.09146881103516, + "logps/rejected": -185.23272705078125, + "loss": 0.0078, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2258577346801758, + "rewards/margins": 10.83736515045166, + "rewards/rejected": -12.063223838806152, + "step": 1104 + }, + { + "epoch": 1.68, + "learning_rate": 7.63356715166556e-08, + "logits/chosen": -1.2933841943740845, + "logits/rejected": -1.330116629600525, + "logps/chosen": -58.654380798339844, + "logps/rejected": -125.71812438964844, + "loss": 0.0076, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5604220628738403, + "rewards/margins": 6.564835548400879, + "rewards/rejected": -7.12525749206543, + "step": 1105 + }, + { + "epoch": 1.68, + "learning_rate": 7.563260203829808e-08, + "logits/chosen": -0.9199784994125366, + "logits/rejected": -0.8492215275764465, + "logps/chosen": -66.62592315673828, + "logps/rejected": -181.0642852783203, + "loss": 0.0074, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.576340913772583, + "rewards/margins": 10.062987327575684, + "rewards/rejected": -11.639326095581055, + "step": 1106 + }, + { + "epoch": 1.68, + "learning_rate": 7.49325202675805e-08, + "logits/chosen": -1.0409053564071655, + "logits/rejected": -0.9511809945106506, + "logps/chosen": -58.40936279296875, + "logps/rejected": -156.097412109375, + "loss": 0.0067, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.83051997423172, + "rewards/margins": 8.35158634185791, + "rewards/rejected": -9.182106018066406, + "step": 1107 + }, + { + "epoch": 1.68, + "learning_rate": 7.423543113334435e-08, + "logits/chosen": -1.1387490034103394, + "logits/rejected": -1.1770416498184204, + "logps/chosen": -70.22821807861328, + "logps/rejected": -193.7679443359375, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4090297818183899, + "rewards/margins": 11.673224449157715, + "rewards/rejected": -12.082253456115723, + "step": 1108 + }, + { + "epoch": 1.68, + "learning_rate": 7.3541339543362e-08, + "logits/chosen": -1.0563454627990723, + "logits/rejected": -1.0912545919418335, + "logps/chosen": -85.34751892089844, + "logps/rejected": -231.6593017578125, + "loss": 0.0065, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5773392915725708, + "rewards/margins": 13.65406608581543, + "rewards/rejected": -15.231405258178711, + "step": 1109 + }, + { + "epoch": 1.69, + "learning_rate": 7.285025038430171e-08, + "logits/chosen": -0.9881210327148438, + "logits/rejected": -0.99253249168396, + "logps/chosen": -40.646854400634766, + "logps/rejected": -143.0417938232422, + "loss": 0.0363, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.40884098410606384, + "rewards/margins": 9.594618797302246, + "rewards/rejected": -9.185776710510254, + "step": 1110 + }, + { + "epoch": 1.69, + "learning_rate": 7.21621685216936e-08, + "logits/chosen": -1.2693352699279785, + "logits/rejected": -1.2725985050201416, + "logps/chosen": -100.3951187133789, + "logps/rejected": -227.0180206298828, + "loss": 0.0122, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.245497226715088, + "rewards/margins": 12.094511032104492, + "rewards/rejected": -14.340007781982422, + "step": 1111 + }, + { + "epoch": 1.69, + "learning_rate": 7.147709879989539e-08, + "logits/chosen": -1.149064064025879, + "logits/rejected": -1.0589556694030762, + "logps/chosen": -77.8072509765625, + "logps/rejected": -206.65069580078125, + "loss": 0.0136, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5425877571105957, + "rewards/margins": 11.325196266174316, + "rewards/rejected": -12.86778450012207, + "step": 1112 + }, + { + "epoch": 1.69, + "learning_rate": 7.079504604205805e-08, + "logits/chosen": -1.0684857368469238, + "logits/rejected": -0.9011760950088501, + "logps/chosen": -70.0149154663086, + "logps/rejected": -208.98086547851562, + "loss": 0.0321, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.238356590270996, + "rewards/margins": 11.44414234161377, + "rewards/rejected": -12.68249797821045, + "step": 1113 + }, + { + "epoch": 1.69, + "learning_rate": 7.011601505009196e-08, + "logits/chosen": -1.1593823432922363, + "logits/rejected": -1.0746428966522217, + "logps/chosen": -71.18487548828125, + "logps/rejected": -203.63807678222656, + "loss": 0.0076, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0665754079818726, + "rewards/margins": 11.836189270019531, + "rewards/rejected": -12.902764320373535, + "step": 1114 + }, + { + "epoch": 1.69, + "learning_rate": 6.944001060463311e-08, + "logits/chosen": -0.9904294610023499, + "logits/rejected": -0.9577285647392273, + "logps/chosen": -60.490360260009766, + "logps/rejected": -159.77786254882812, + "loss": 0.0072, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0130398273468018, + "rewards/margins": 8.982420921325684, + "rewards/rejected": -9.995460510253906, + "step": 1115 + }, + { + "epoch": 1.7, + "learning_rate": 6.876703746500984e-08, + "logits/chosen": -1.1600037813186646, + "logits/rejected": -1.0456393957138062, + "logps/chosen": -50.8457145690918, + "logps/rejected": -146.4720916748047, + "loss": 0.0239, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8990418910980225, + "rewards/margins": 8.573019981384277, + "rewards/rejected": -9.472062110900879, + "step": 1116 + }, + { + "epoch": 1.7, + "learning_rate": 6.809710036920818e-08, + "logits/chosen": -1.0500625371932983, + "logits/rejected": -1.011877179145813, + "logps/chosen": -75.9246597290039, + "logps/rejected": -194.84063720703125, + "loss": 0.0109, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.75282883644104, + "rewards/margins": 11.360499382019043, + "rewards/rejected": -12.11332893371582, + "step": 1117 + }, + { + "epoch": 1.7, + "learning_rate": 6.743020403383997e-08, + "logits/chosen": -1.1439069509506226, + "logits/rejected": -1.0360243320465088, + "logps/chosen": -84.5653076171875, + "logps/rejected": -237.63262939453125, + "loss": 0.0137, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4115619659423828, + "rewards/margins": 13.395352363586426, + "rewards/rejected": -14.806913375854492, + "step": 1118 + }, + { + "epoch": 1.7, + "learning_rate": 6.676635315410855e-08, + "logits/chosen": -1.220572829246521, + "logits/rejected": -1.3106917142868042, + "logps/chosen": -59.45014953613281, + "logps/rejected": -133.43246459960938, + "loss": 0.0274, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6084131002426147, + "rewards/margins": 7.392858505249023, + "rewards/rejected": -8.00127124786377, + "step": 1119 + }, + { + "epoch": 1.7, + "learning_rate": 6.610555240377652e-08, + "logits/chosen": -1.1922484636306763, + "logits/rejected": -1.024374008178711, + "logps/chosen": -75.49510955810547, + "logps/rejected": -239.79537963867188, + "loss": 0.0072, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1918355226516724, + "rewards/margins": 13.71989631652832, + "rewards/rejected": -14.911730766296387, + "step": 1120 + }, + { + "epoch": 1.7, + "learning_rate": 6.544780643513159e-08, + "logits/chosen": -1.0872753858566284, + "logits/rejected": -0.9683758020401001, + "logps/chosen": -75.62218475341797, + "logps/rejected": -247.2213592529297, + "loss": 0.0067, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1390029191970825, + "rewards/margins": 15.164216995239258, + "rewards/rejected": -16.303220748901367, + "step": 1121 + }, + { + "epoch": 1.7, + "learning_rate": 6.479311987895558e-08, + "logits/chosen": -1.113074779510498, + "logits/rejected": -1.0629303455352783, + "logps/chosen": -73.87572479248047, + "logps/rejected": -216.52003479003906, + "loss": 0.0138, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9578240513801575, + "rewards/margins": 12.23649787902832, + "rewards/rejected": -13.19432258605957, + "step": 1122 + }, + { + "epoch": 1.71, + "learning_rate": 6.414149734449037e-08, + "logits/chosen": -1.2681244611740112, + "logits/rejected": -1.3629964590072632, + "logps/chosen": -83.48455810546875, + "logps/rejected": -167.93890380859375, + "loss": 0.0042, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.735581874847412, + "rewards/margins": 9.227690696716309, + "rewards/rejected": -10.963273048400879, + "step": 1123 + }, + { + "epoch": 1.71, + "learning_rate": 6.349294341940592e-08, + "logits/chosen": -1.1680818796157837, + "logits/rejected": -1.0755884647369385, + "logps/chosen": -76.60441589355469, + "logps/rejected": -218.47650146484375, + "loss": 0.0165, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6303492784500122, + "rewards/margins": 11.845648765563965, + "rewards/rejected": -13.475997924804688, + "step": 1124 + }, + { + "epoch": 1.71, + "learning_rate": 6.284746266976832e-08, + "logits/chosen": -1.0293267965316772, + "logits/rejected": -1.013458251953125, + "logps/chosen": -74.43067169189453, + "logps/rejected": -183.85137939453125, + "loss": 0.0048, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1002659797668457, + "rewards/margins": 9.507320404052734, + "rewards/rejected": -11.607585906982422, + "step": 1125 + }, + { + "epoch": 1.71, + "learning_rate": 6.220505964000716e-08, + "logits/chosen": -1.0351016521453857, + "logits/rejected": -0.8751451373100281, + "logps/chosen": -87.07731628417969, + "logps/rejected": -228.90122985839844, + "loss": 0.0039, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9597899913787842, + "rewards/margins": 12.763002395629883, + "rewards/rejected": -14.722792625427246, + "step": 1126 + }, + { + "epoch": 1.71, + "learning_rate": 6.156573885288375e-08, + "logits/chosen": -1.4418346881866455, + "logits/rejected": -1.2614383697509766, + "logps/chosen": -57.170623779296875, + "logps/rejected": -194.32440185546875, + "loss": 0.0088, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6441372632980347, + "rewards/margins": 10.866323471069336, + "rewards/rejected": -11.510459899902344, + "step": 1127 + }, + { + "epoch": 1.71, + "learning_rate": 6.092950480945897e-08, + "logits/chosen": -1.3853294849395752, + "logits/rejected": -1.2317214012145996, + "logps/chosen": -58.2214469909668, + "logps/rejected": -191.49949645996094, + "loss": 0.0062, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.45231327414512634, + "rewards/margins": 11.297001838684082, + "rewards/rejected": -11.749314308166504, + "step": 1128 + }, + { + "epoch": 1.72, + "learning_rate": 6.029636198906246e-08, + "logits/chosen": -1.1470444202423096, + "logits/rejected": -1.064419150352478, + "logps/chosen": -82.69210815429688, + "logps/rejected": -216.30880737304688, + "loss": 0.0072, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6808559894561768, + "rewards/margins": 12.330316543579102, + "rewards/rejected": -14.011174201965332, + "step": 1129 + }, + { + "epoch": 1.72, + "learning_rate": 5.966631484925993e-08, + "logits/chosen": -1.0834418535232544, + "logits/rejected": -1.071006178855896, + "logps/chosen": -87.85391998291016, + "logps/rejected": -242.10052490234375, + "loss": 0.0166, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9494872093200684, + "rewards/margins": 14.41417121887207, + "rewards/rejected": -16.363658905029297, + "step": 1130 + }, + { + "epoch": 1.72, + "learning_rate": 5.9039367825822526e-08, + "logits/chosen": -1.2992407083511353, + "logits/rejected": -1.2516363859176636, + "logps/chosen": -83.01346588134766, + "logps/rejected": -220.72406005859375, + "loss": 0.014, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.455176830291748, + "rewards/margins": 12.305998802185059, + "rewards/rejected": -13.761176109313965, + "step": 1131 + }, + { + "epoch": 1.72, + "learning_rate": 5.8415525332695334e-08, + "logits/chosen": -1.2400323152542114, + "logits/rejected": -1.147408366203308, + "logps/chosen": -70.5896224975586, + "logps/rejected": -168.30020141601562, + "loss": 0.0106, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8818989992141724, + "rewards/margins": 7.773576736450195, + "rewards/rejected": -9.655476570129395, + "step": 1132 + }, + { + "epoch": 1.72, + "learning_rate": 5.7794791761966664e-08, + "logits/chosen": -1.2419061660766602, + "logits/rejected": -1.086823582649231, + "logps/chosen": -75.44318389892578, + "logps/rejected": -233.98483276367188, + "loss": 0.0114, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3124146461486816, + "rewards/margins": 13.510801315307617, + "rewards/rejected": -14.823214530944824, + "step": 1133 + }, + { + "epoch": 1.72, + "learning_rate": 5.717717148383616e-08, + "logits/chosen": -0.999576210975647, + "logits/rejected": -0.9369627833366394, + "logps/chosen": -82.40902709960938, + "logps/rejected": -209.4818572998047, + "loss": 0.0093, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.0064735412597656, + "rewards/margins": 10.635801315307617, + "rewards/rejected": -13.642273902893066, + "step": 1134 + }, + { + "epoch": 1.72, + "learning_rate": 5.6562668846585504e-08, + "logits/chosen": -1.1111935377120972, + "logits/rejected": -1.075188159942627, + "logps/chosen": -64.33235931396484, + "logps/rejected": -160.79803466796875, + "loss": 0.0159, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1467089653015137, + "rewards/margins": 9.329978942871094, + "rewards/rejected": -10.476689338684082, + "step": 1135 + }, + { + "epoch": 1.73, + "learning_rate": 5.595128817654637e-08, + "logits/chosen": -1.1287126541137695, + "logits/rejected": -1.0575056076049805, + "logps/chosen": -65.08252716064453, + "logps/rejected": -173.18484497070312, + "loss": 0.0115, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.643226981163025, + "rewards/margins": 9.848438262939453, + "rewards/rejected": -11.491665840148926, + "step": 1136 + }, + { + "epoch": 1.73, + "learning_rate": 5.53430337780712e-08, + "logits/chosen": -1.230764389038086, + "logits/rejected": -1.082259178161621, + "logps/chosen": -75.74681854248047, + "logps/rejected": -228.3786163330078, + "loss": 0.0152, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.030249834060669, + "rewards/margins": 13.466056823730469, + "rewards/rejected": -14.496305465698242, + "step": 1137 + }, + { + "epoch": 1.73, + "learning_rate": 5.473790993350152e-08, + "logits/chosen": -1.206829309463501, + "logits/rejected": -1.0737767219543457, + "logps/chosen": -64.42081451416016, + "logps/rejected": -179.85597229003906, + "loss": 0.014, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1375069618225098, + "rewards/margins": 10.590378761291504, + "rewards/rejected": -11.727885246276855, + "step": 1138 + }, + { + "epoch": 1.73, + "learning_rate": 5.413592090313929e-08, + "logits/chosen": -1.1952977180480957, + "logits/rejected": -1.1582542657852173, + "logps/chosen": -74.84733581542969, + "logps/rejected": -187.56007385253906, + "loss": 0.0141, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.404332160949707, + "rewards/margins": 10.64260482788086, + "rewards/rejected": -12.04693603515625, + "step": 1139 + }, + { + "epoch": 1.73, + "learning_rate": 5.353707092521581e-08, + "logits/chosen": -1.2151238918304443, + "logits/rejected": -1.201400637626648, + "logps/chosen": -78.33900451660156, + "logps/rejected": -170.19369506835938, + "loss": 0.0052, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1412068605422974, + "rewards/margins": 9.702839851379395, + "rewards/rejected": -10.844047546386719, + "step": 1140 + }, + { + "epoch": 1.73, + "learning_rate": 5.294136421586226e-08, + "logits/chosen": -1.2093555927276611, + "logits/rejected": -1.1871004104614258, + "logps/chosen": -73.68685913085938, + "logps/rejected": -179.29037475585938, + "loss": 0.0089, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.404720425605774, + "rewards/margins": 9.682507514953613, + "rewards/rejected": -11.087226867675781, + "step": 1141 + }, + { + "epoch": 1.73, + "learning_rate": 5.234880496908028e-08, + "logits/chosen": -0.8909991979598999, + "logits/rejected": -0.8302620649337769, + "logps/chosen": -81.54849243164062, + "logps/rejected": -219.3596649169922, + "loss": 0.0047, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5307843685150146, + "rewards/margins": 12.892635345458984, + "rewards/rejected": -14.423418045043945, + "step": 1142 + }, + { + "epoch": 1.74, + "learning_rate": 5.175939735671186e-08, + "logits/chosen": -0.9654414057731628, + "logits/rejected": -0.8610618114471436, + "logps/chosen": -61.41965866088867, + "logps/rejected": -190.81536865234375, + "loss": 0.0095, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.285893201828003, + "rewards/margins": 11.655067443847656, + "rewards/rejected": -12.940960884094238, + "step": 1143 + }, + { + "epoch": 1.74, + "learning_rate": 5.1173145528410513e-08, + "logits/chosen": -1.44536554813385, + "logits/rejected": -1.3629040718078613, + "logps/chosen": -68.13104248046875, + "logps/rejected": -177.6975555419922, + "loss": 0.0097, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9723241329193115, + "rewards/margins": 10.045774459838867, + "rewards/rejected": -11.018098831176758, + "step": 1144 + }, + { + "epoch": 1.74, + "learning_rate": 5.059005361161156e-08, + "logits/chosen": -1.289172649383545, + "logits/rejected": -1.2248356342315674, + "logps/chosen": -86.52054595947266, + "logps/rejected": -204.8133087158203, + "loss": 0.0103, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5036221742630005, + "rewards/margins": 10.382984161376953, + "rewards/rejected": -11.886605262756348, + "step": 1145 + }, + { + "epoch": 1.74, + "learning_rate": 5.001012571150382e-08, + "logits/chosen": -1.1975736618041992, + "logits/rejected": -1.1259053945541382, + "logps/chosen": -63.912940979003906, + "logps/rejected": -180.1395263671875, + "loss": 0.0116, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7342360019683838, + "rewards/margins": 10.14150619506836, + "rewards/rejected": -11.875743865966797, + "step": 1146 + }, + { + "epoch": 1.74, + "learning_rate": 4.943336591099978e-08, + "logits/chosen": -1.1472722291946411, + "logits/rejected": -1.1231780052185059, + "logps/chosen": -49.86153793334961, + "logps/rejected": -123.01583099365234, + "loss": 0.0163, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.19975323975086212, + "rewards/margins": 7.285927772521973, + "rewards/rejected": -7.485680103302002, + "step": 1147 + }, + { + "epoch": 1.74, + "learning_rate": 4.885977827070747e-08, + "logits/chosen": -1.2718511819839478, + "logits/rejected": -1.2030029296875, + "logps/chosen": -71.72444915771484, + "logps/rejected": -181.01756286621094, + "loss": 0.018, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0611457824707031, + "rewards/margins": 9.881611824035645, + "rewards/rejected": -10.942757606506348, + "step": 1148 + }, + { + "epoch": 1.75, + "learning_rate": 4.828936682890161e-08, + "logits/chosen": -1.3181617259979248, + "logits/rejected": -1.2724326848983765, + "logps/chosen": -92.13249206542969, + "logps/rejected": -233.87696838378906, + "loss": 0.0031, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3504269123077393, + "rewards/margins": 12.301064491271973, + "rewards/rejected": -14.651491165161133, + "step": 1149 + }, + { + "epoch": 1.75, + "learning_rate": 4.772213560149568e-08, + "logits/chosen": -1.194939136505127, + "logits/rejected": -1.1765731573104858, + "logps/chosen": -91.87400817871094, + "logps/rejected": -207.25135803222656, + "loss": 0.0064, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.496976852416992, + "rewards/margins": 10.776562690734863, + "rewards/rejected": -13.273541450500488, + "step": 1150 + }, + { + "epoch": 1.75, + "learning_rate": 4.715808858201254e-08, + "logits/chosen": -1.073747992515564, + "logits/rejected": -0.9684121608734131, + "logps/chosen": -70.1568603515625, + "logps/rejected": -192.5471954345703, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.241762399673462, + "rewards/margins": 11.266307830810547, + "rewards/rejected": -12.50806999206543, + "step": 1151 + }, + { + "epoch": 1.75, + "learning_rate": 4.6597229741557666e-08, + "logits/chosen": -1.3962565660476685, + "logits/rejected": -1.2379064559936523, + "logps/chosen": -95.88238525390625, + "logps/rejected": -254.2093505859375, + "loss": 0.0086, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0294032096862793, + "rewards/margins": 13.60114860534668, + "rewards/rejected": -15.630552291870117, + "step": 1152 + }, + { + "epoch": 1.75, + "learning_rate": 4.603956302879025e-08, + "logits/chosen": -1.119269847869873, + "logits/rejected": -1.151871681213379, + "logps/chosen": -55.78212356567383, + "logps/rejected": -155.22653198242188, + "loss": 0.0228, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0021774768829346, + "rewards/margins": 9.199004173278809, + "rewards/rejected": -10.201181411743164, + "step": 1153 + }, + { + "epoch": 1.75, + "learning_rate": 4.548509236989562e-08, + "logits/chosen": -1.0740209817886353, + "logits/rejected": -1.0154675245285034, + "logps/chosen": -56.02338409423828, + "logps/rejected": -152.66915893554688, + "loss": 0.0118, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5767076015472412, + "rewards/margins": 9.372485160827637, + "rewards/rejected": -9.949193000793457, + "step": 1154 + }, + { + "epoch": 1.75, + "learning_rate": 4.4933821668557914e-08, + "logits/chosen": -1.001431941986084, + "logits/rejected": -0.9505981206893921, + "logps/chosen": -68.77754974365234, + "logps/rejected": -164.1874542236328, + "loss": 0.0132, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1711878776550293, + "rewards/margins": 8.695853233337402, + "rewards/rejected": -9.867039680480957, + "step": 1155 + }, + { + "epoch": 1.76, + "learning_rate": 4.4385754805932095e-08, + "logits/chosen": -1.2005764245986938, + "logits/rejected": -1.1242300271987915, + "logps/chosen": -76.149658203125, + "logps/rejected": -185.9651336669922, + "loss": 0.0072, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9307671785354614, + "rewards/margins": 9.996943473815918, + "rewards/rejected": -11.927711486816406, + "step": 1156 + }, + { + "epoch": 1.76, + "learning_rate": 4.384089564061727e-08, + "logits/chosen": -1.205551028251648, + "logits/rejected": -1.0542964935302734, + "logps/chosen": -87.11665344238281, + "logps/rejected": -245.56309509277344, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1137125492095947, + "rewards/margins": 13.458393096923828, + "rewards/rejected": -15.57210636138916, + "step": 1157 + }, + { + "epoch": 1.76, + "learning_rate": 4.3299248008628495e-08, + "logits/chosen": -1.346709132194519, + "logits/rejected": -1.3536221981048584, + "logps/chosen": -71.81742858886719, + "logps/rejected": -193.46884155273438, + "loss": 0.0051, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.651132345199585, + "rewards/margins": 11.359602928161621, + "rewards/rejected": -13.010734558105469, + "step": 1158 + }, + { + "epoch": 1.76, + "learning_rate": 4.276081572337109e-08, + "logits/chosen": -1.2646945714950562, + "logits/rejected": -1.179612398147583, + "logps/chosen": -61.9714469909668, + "logps/rejected": -169.14645385742188, + "loss": 0.0083, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9661446809768677, + "rewards/margins": 9.379840850830078, + "rewards/rejected": -10.345985412597656, + "step": 1159 + }, + { + "epoch": 1.76, + "learning_rate": 4.222560257561275e-08, + "logits/chosen": -0.9648147225379944, + "logits/rejected": -1.0111454725265503, + "logps/chosen": -108.44926452636719, + "logps/rejected": -273.5411071777344, + "loss": 0.0101, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.9361164569854736, + "rewards/margins": 14.968153953552246, + "rewards/rejected": -17.90427017211914, + "step": 1160 + }, + { + "epoch": 1.76, + "learning_rate": 4.169361233345725e-08, + "logits/chosen": -0.99238121509552, + "logits/rejected": -0.9608970880508423, + "logps/chosen": -41.922630310058594, + "logps/rejected": -153.88870239257812, + "loss": 0.0039, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.029361620545387268, + "rewards/margins": 10.260735511779785, + "rewards/rejected": -10.231374740600586, + "step": 1161 + }, + { + "epoch": 1.77, + "learning_rate": 4.116484874231785e-08, + "logits/chosen": -1.1681238412857056, + "logits/rejected": -1.0290806293487549, + "logps/chosen": -112.41152954101562, + "logps/rejected": -255.91331481933594, + "loss": 0.0194, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.348160743713379, + "rewards/margins": 12.28393268585205, + "rewards/rejected": -16.63209342956543, + "step": 1162 + }, + { + "epoch": 1.77, + "learning_rate": 4.0639315524891304e-08, + "logits/chosen": -0.9522886276245117, + "logits/rejected": -0.9439799189567566, + "logps/chosen": -68.50711822509766, + "logps/rejected": -158.6794891357422, + "loss": 0.0198, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7130775451660156, + "rewards/margins": 9.161654472351074, + "rewards/rejected": -9.874731063842773, + "step": 1163 + }, + { + "epoch": 1.77, + "learning_rate": 4.0117016381130634e-08, + "logits/chosen": -1.0935475826263428, + "logits/rejected": -0.9440351128578186, + "logps/chosen": -112.45979309082031, + "logps/rejected": -252.8640594482422, + "loss": 0.0079, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.46082878112793, + "rewards/margins": 12.47606372833252, + "rewards/rejected": -16.936891555786133, + "step": 1164 + }, + { + "epoch": 1.77, + "learning_rate": 3.959795498822055e-08, + "logits/chosen": -1.3229010105133057, + "logits/rejected": -1.2653900384902954, + "logps/chosen": -63.48313522338867, + "logps/rejected": -165.02943420410156, + "loss": 0.0068, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9819265604019165, + "rewards/margins": 8.870457649230957, + "rewards/rejected": -9.852384567260742, + "step": 1165 + }, + { + "epoch": 1.77, + "learning_rate": 3.9082135000550074e-08, + "logits/chosen": -1.1480696201324463, + "logits/rejected": -1.058894395828247, + "logps/chosen": -76.19339752197266, + "logps/rejected": -244.94894409179688, + "loss": 0.0055, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2183040380477905, + "rewards/margins": 14.837517738342285, + "rewards/rejected": -16.05582046508789, + "step": 1166 + }, + { + "epoch": 1.77, + "learning_rate": 3.856956004968803e-08, + "logits/chosen": -1.290866494178772, + "logits/rejected": -1.3397711515426636, + "logps/chosen": -90.19656372070312, + "logps/rejected": -180.97918701171875, + "loss": 0.0086, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.1676547527313232, + "rewards/margins": 8.439589500427246, + "rewards/rejected": -11.607244491577148, + "step": 1167 + }, + { + "epoch": 1.77, + "learning_rate": 3.806023374435663e-08, + "logits/chosen": -1.1712409257888794, + "logits/rejected": -1.1349647045135498, + "logps/chosen": -90.69806671142578, + "logps/rejected": -242.47714233398438, + "loss": 0.0042, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2193498611450195, + "rewards/margins": 13.698222160339355, + "rewards/rejected": -15.917571067810059, + "step": 1168 + }, + { + "epoch": 1.78, + "learning_rate": 3.7554159670406535e-08, + "logits/chosen": -1.2209597826004028, + "logits/rejected": -1.041914701461792, + "logps/chosen": -82.41712951660156, + "logps/rejected": -221.45584106445312, + "loss": 0.0064, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1218905448913574, + "rewards/margins": 12.437333106994629, + "rewards/rejected": -14.559225082397461, + "step": 1169 + }, + { + "epoch": 1.78, + "learning_rate": 3.705134139079136e-08, + "logits/chosen": -1.0915639400482178, + "logits/rejected": -0.9363446235656738, + "logps/chosen": -62.273983001708984, + "logps/rejected": -199.3795623779297, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6483842730522156, + "rewards/margins": 12.303706169128418, + "rewards/rejected": -12.9520902633667, + "step": 1170 + }, + { + "epoch": 1.78, + "learning_rate": 3.655178244554269e-08, + "logits/chosen": -1.1844284534454346, + "logits/rejected": -1.2165160179138184, + "logps/chosen": -66.18008422851562, + "logps/rejected": -159.49417114257812, + "loss": 0.0256, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4386379718780518, + "rewards/margins": 9.355382919311523, + "rewards/rejected": -10.794020652770996, + "step": 1171 + }, + { + "epoch": 1.78, + "learning_rate": 3.6055486351745324e-08, + "logits/chosen": -1.2054702043533325, + "logits/rejected": -1.1438997983932495, + "logps/chosen": -80.12940216064453, + "logps/rejected": -199.4840545654297, + "loss": 0.0047, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3853845596313477, + "rewards/margins": 11.196819305419922, + "rewards/rejected": -12.58220386505127, + "step": 1172 + }, + { + "epoch": 1.78, + "learning_rate": 3.55624566035122e-08, + "logits/chosen": -1.2264925241470337, + "logits/rejected": -1.1079483032226562, + "logps/chosen": -86.93685150146484, + "logps/rejected": -231.08782958984375, + "loss": 0.0093, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1986989974975586, + "rewards/margins": 12.410072326660156, + "rewards/rejected": -14.608769416809082, + "step": 1173 + }, + { + "epoch": 1.78, + "learning_rate": 3.507269667196005e-08, + "logits/chosen": -1.0240446329116821, + "logits/rejected": -0.9470053315162659, + "logps/chosen": -52.67414093017578, + "logps/rejected": -150.68316650390625, + "loss": 0.0103, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0014339275658130646, + "rewards/margins": 9.25440788269043, + "rewards/rejected": -9.252973556518555, + "step": 1174 + }, + { + "epoch": 1.79, + "learning_rate": 3.458621000518458e-08, + "logits/chosen": -1.119564175605774, + "logits/rejected": -1.0477330684661865, + "logps/chosen": -73.17056274414062, + "logps/rejected": -179.02903747558594, + "loss": 0.0111, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6565520763397217, + "rewards/margins": 9.676838874816895, + "rewards/rejected": -11.333391189575195, + "step": 1175 + }, + { + "epoch": 1.79, + "learning_rate": 3.4103000028236906e-08, + "logits/chosen": -1.2833119630813599, + "logits/rejected": -1.1382286548614502, + "logps/chosen": -79.15277862548828, + "logps/rejected": -245.55087280273438, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2422773838043213, + "rewards/margins": 13.866373062133789, + "rewards/rejected": -15.108651161193848, + "step": 1176 + }, + { + "epoch": 1.79, + "learning_rate": 3.362307014309868e-08, + "logits/chosen": -1.1669962406158447, + "logits/rejected": -1.1485800743103027, + "logps/chosen": -77.07050323486328, + "logps/rejected": -199.91168212890625, + "loss": 0.0146, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5542712211608887, + "rewards/margins": 11.39111328125, + "rewards/rejected": -12.945383071899414, + "step": 1177 + }, + { + "epoch": 1.79, + "learning_rate": 3.31464237286585e-08, + "logits/chosen": -0.9818435907363892, + "logits/rejected": -0.9378212690353394, + "logps/chosen": -79.21688079833984, + "logps/rejected": -173.23800659179688, + "loss": 0.0077, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5746930837631226, + "rewards/margins": 9.1552095413208, + "rewards/rejected": -10.729902267456055, + "step": 1178 + }, + { + "epoch": 1.79, + "learning_rate": 3.26730641406881e-08, + "logits/chosen": -1.07041335105896, + "logits/rejected": -1.0159380435943604, + "logps/chosen": -66.49639129638672, + "logps/rejected": -174.3088836669922, + "loss": 0.0081, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8144354820251465, + "rewards/margins": 9.70686149597168, + "rewards/rejected": -10.521297454833984, + "step": 1179 + }, + { + "epoch": 1.79, + "learning_rate": 3.220299471181898e-08, + "logits/chosen": -1.0575075149536133, + "logits/rejected": -0.9967933893203735, + "logps/chosen": -54.594669342041016, + "logps/rejected": -144.82998657226562, + "loss": 0.035, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.07825981825590134, + "rewards/margins": 8.32921314239502, + "rewards/rejected": -8.407472610473633, + "step": 1180 + }, + { + "epoch": 1.79, + "learning_rate": 3.173621875151811e-08, + "logits/chosen": -1.2280124425888062, + "logits/rejected": -1.182820439338684, + "logps/chosen": -94.21824645996094, + "logps/rejected": -223.11990356445312, + "loss": 0.0107, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3519139289855957, + "rewards/margins": 12.459220886230469, + "rewards/rejected": -14.811135292053223, + "step": 1181 + }, + { + "epoch": 1.8, + "learning_rate": 3.127273954606574e-08, + "logits/chosen": -1.0089362859725952, + "logits/rejected": -0.96539306640625, + "logps/chosen": -80.09423065185547, + "logps/rejected": -173.8900909423828, + "loss": 0.0077, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.111353874206543, + "rewards/margins": 8.772262573242188, + "rewards/rejected": -10.88361644744873, + "step": 1182 + }, + { + "epoch": 1.8, + "learning_rate": 3.0812560358531356e-08, + "logits/chosen": -1.041933536529541, + "logits/rejected": -1.019261360168457, + "logps/chosen": -63.75693130493164, + "logps/rejected": -154.80682373046875, + "loss": 0.0314, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.1399602890014648, + "rewards/margins": 8.418811798095703, + "rewards/rejected": -9.558772087097168, + "step": 1183 + }, + { + "epoch": 1.8, + "learning_rate": 3.035568442875136e-08, + "logits/chosen": -1.0853257179260254, + "logits/rejected": -0.9606943130493164, + "logps/chosen": -90.47114562988281, + "logps/rejected": -224.79037475585938, + "loss": 0.0356, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9242072105407715, + "rewards/margins": 11.84377384185791, + "rewards/rejected": -13.76798152923584, + "step": 1184 + }, + { + "epoch": 1.8, + "learning_rate": 2.990211497330569e-08, + "logits/chosen": -1.3671926259994507, + "logits/rejected": -1.3777002096176147, + "logps/chosen": -90.4779281616211, + "logps/rejected": -193.55160522460938, + "loss": 0.0231, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6756185293197632, + "rewards/margins": 10.614287376403809, + "rewards/rejected": -12.28990650177002, + "step": 1185 + }, + { + "epoch": 1.8, + "learning_rate": 2.9451855185495532e-08, + "logits/chosen": -0.9833270311355591, + "logits/rejected": -0.930718719959259, + "logps/chosen": -58.677024841308594, + "logps/rejected": -166.68373107910156, + "loss": 0.0098, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.30341848731040955, + "rewards/margins": 10.041521072387695, + "rewards/rejected": -10.344938278198242, + "step": 1186 + }, + { + "epoch": 1.8, + "learning_rate": 2.9004908235320924e-08, + "logits/chosen": -1.1783056259155273, + "logits/rejected": -1.1107795238494873, + "logps/chosen": -63.97560501098633, + "logps/rejected": -172.8702850341797, + "loss": 0.017, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8006558418273926, + "rewards/margins": 9.48755168914795, + "rewards/rejected": -10.2882080078125, + "step": 1187 + }, + { + "epoch": 1.8, + "learning_rate": 2.8561277269457895e-08, + "logits/chosen": -0.9910872578620911, + "logits/rejected": -0.9650610685348511, + "logps/chosen": -75.24369812011719, + "logps/rejected": -183.6258544921875, + "loss": 0.0169, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.5646166801452637, + "rewards/margins": 9.786337852478027, + "rewards/rejected": -12.350954055786133, + "step": 1188 + }, + { + "epoch": 1.81, + "learning_rate": 2.8120965411237152e-08, + "logits/chosen": -0.8922029137611389, + "logits/rejected": -0.8234670162200928, + "logps/chosen": -55.13329315185547, + "logps/rejected": -152.74169921875, + "loss": 0.006, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9872815608978271, + "rewards/margins": 9.220758438110352, + "rewards/rejected": -10.208039283752441, + "step": 1189 + }, + { + "epoch": 1.81, + "learning_rate": 2.768397576062126e-08, + "logits/chosen": -1.1451364755630493, + "logits/rejected": -0.9374922513961792, + "logps/chosen": -97.60246276855469, + "logps/rejected": -283.8183288574219, + "loss": 0.0051, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.52813982963562, + "rewards/margins": 14.909027099609375, + "rewards/rejected": -17.437166213989258, + "step": 1190 + }, + { + "epoch": 1.81, + "learning_rate": 2.7250311394183378e-08, + "logits/chosen": -1.03672194480896, + "logits/rejected": -0.9891068339347839, + "logps/chosen": -84.0450210571289, + "logps/rejected": -193.334716796875, + "loss": 0.005, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.645227313041687, + "rewards/margins": 10.940584182739258, + "rewards/rejected": -12.585811614990234, + "step": 1191 + }, + { + "epoch": 1.81, + "learning_rate": 2.6819975365085236e-08, + "logits/chosen": -1.0003464221954346, + "logits/rejected": -0.9259211421012878, + "logps/chosen": -62.66392135620117, + "logps/rejected": -171.12786865234375, + "loss": 0.0189, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6360028386116028, + "rewards/margins": 10.064746856689453, + "rewards/rejected": -10.700749397277832, + "step": 1192 + }, + { + "epoch": 1.81, + "learning_rate": 2.6392970703056018e-08, + "logits/chosen": -1.2499942779541016, + "logits/rejected": -1.0891692638397217, + "logps/chosen": -59.001625061035156, + "logps/rejected": -193.19635009765625, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1801376342773438, + "rewards/margins": 10.908477783203125, + "rewards/rejected": -12.088614463806152, + "step": 1193 + }, + { + "epoch": 1.81, + "learning_rate": 2.596930041437062e-08, + "logits/chosen": -1.2095203399658203, + "logits/rejected": -1.0741009712219238, + "logps/chosen": -54.60664749145508, + "logps/rejected": -171.01107788085938, + "loss": 0.0148, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6850531101226807, + "rewards/margins": 10.063095092773438, + "rewards/rejected": -10.748147964477539, + "step": 1194 + }, + { + "epoch": 1.82, + "learning_rate": 2.554896748182883e-08, + "logits/chosen": -1.0899300575256348, + "logits/rejected": -1.0272502899169922, + "logps/chosen": -49.843116760253906, + "logps/rejected": -144.82833862304688, + "loss": 0.0083, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2501094341278076, + "rewards/margins": 9.574674606323242, + "rewards/rejected": -9.324565887451172, + "step": 1195 + }, + { + "epoch": 1.82, + "learning_rate": 2.513197486473406e-08, + "logits/chosen": -1.2003647089004517, + "logits/rejected": -1.0822029113769531, + "logps/chosen": -77.27375793457031, + "logps/rejected": -232.38385009765625, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.034325122833252, + "rewards/margins": 13.187532424926758, + "rewards/rejected": -15.221856117248535, + "step": 1196 + }, + { + "epoch": 1.82, + "learning_rate": 2.471832549887276e-08, + "logits/chosen": -1.4846813678741455, + "logits/rejected": -1.3707112073898315, + "logps/chosen": -84.6400375366211, + "logps/rejected": -253.81854248046875, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6491315364837646, + "rewards/margins": 15.211004257202148, + "rewards/rejected": -16.860137939453125, + "step": 1197 + }, + { + "epoch": 1.82, + "learning_rate": 2.430802229649348e-08, + "logits/chosen": -1.04225754737854, + "logits/rejected": -1.037789225578308, + "logps/chosen": -79.18376159667969, + "logps/rejected": -171.23422241210938, + "loss": 0.0199, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.4485316276550293, + "rewards/margins": 8.454278945922852, + "rewards/rejected": -10.902810096740723, + "step": 1198 + }, + { + "epoch": 1.82, + "learning_rate": 2.390106814628662e-08, + "logits/chosen": -0.9821594953536987, + "logits/rejected": -0.9214787483215332, + "logps/chosen": -75.97613525390625, + "logps/rejected": -203.15115356445312, + "loss": 0.0152, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9587711095809937, + "rewards/margins": 11.724018096923828, + "rewards/rejected": -12.68278980255127, + "step": 1199 + }, + { + "epoch": 1.82, + "learning_rate": 2.3497465913364046e-08, + "logits/chosen": -1.104460597038269, + "logits/rejected": -1.038111686706543, + "logps/chosen": -89.49078369140625, + "logps/rejected": -228.0952911376953, + "loss": 0.0081, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.722094178199768, + "rewards/margins": 12.232990264892578, + "rewards/rejected": -13.955083847045898, + "step": 1200 + }, + { + "epoch": 1.82, + "learning_rate": 2.309721843923851e-08, + "logits/chosen": -1.0601420402526855, + "logits/rejected": -0.9911051392555237, + "logps/chosen": -74.96670532226562, + "logps/rejected": -190.51840209960938, + "loss": 0.0087, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3472012281417847, + "rewards/margins": 11.10965347290039, + "rewards/rejected": -12.456853866577148, + "step": 1201 + }, + { + "epoch": 1.83, + "learning_rate": 2.2700328541804426e-08, + "logits/chosen": -1.3039923906326294, + "logits/rejected": -1.2911025285720825, + "logps/chosen": -108.84371185302734, + "logps/rejected": -235.702392578125, + "loss": 0.0121, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.061046838760376, + "rewards/margins": 12.482583999633789, + "rewards/rejected": -14.543630599975586, + "step": 1202 + }, + { + "epoch": 1.83, + "learning_rate": 2.2306799015317292e-08, + "logits/chosen": -1.1615723371505737, + "logits/rejected": -0.991241455078125, + "logps/chosen": -82.74966430664062, + "logps/rejected": -252.6170196533203, + "loss": 0.0095, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.834315299987793, + "rewards/margins": 15.210193634033203, + "rewards/rejected": -17.04450798034668, + "step": 1203 + }, + { + "epoch": 1.83, + "learning_rate": 2.1916632630374577e-08, + "logits/chosen": -1.174147129058838, + "logits/rejected": -1.1671512126922607, + "logps/chosen": -55.36254119873047, + "logps/rejected": -171.36447143554688, + "loss": 0.018, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8767067193984985, + "rewards/margins": 10.042999267578125, + "rewards/rejected": -10.919706344604492, + "step": 1204 + }, + { + "epoch": 1.83, + "learning_rate": 2.1529832133895588e-08, + "logits/chosen": -1.0668604373931885, + "logits/rejected": -0.967597246170044, + "logps/chosen": -100.05206298828125, + "logps/rejected": -270.90191650390625, + "loss": 0.0144, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6888511180877686, + "rewards/margins": 14.828164100646973, + "rewards/rejected": -17.51701545715332, + "step": 1205 + }, + { + "epoch": 1.83, + "learning_rate": 2.1146400249102802e-08, + "logits/chosen": -1.397310733795166, + "logits/rejected": -1.3512035608291626, + "logps/chosen": -67.1437759399414, + "logps/rejected": -159.02467346191406, + "loss": 0.0419, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1888231039047241, + "rewards/margins": 9.070355415344238, + "rewards/rejected": -10.25917911529541, + "step": 1206 + }, + { + "epoch": 1.83, + "learning_rate": 2.0766339675502397e-08, + "logits/chosen": -1.0767494440078735, + "logits/rejected": -0.965934693813324, + "logps/chosen": -101.39511108398438, + "logps/rejected": -261.9713439941406, + "loss": 0.0056, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.6425881385803223, + "rewards/margins": 13.892492294311523, + "rewards/rejected": -17.535079956054688, + "step": 1207 + }, + { + "epoch": 1.84, + "learning_rate": 2.0389653088865033e-08, + "logits/chosen": -1.4239978790283203, + "logits/rejected": -1.3504027128219604, + "logps/chosen": -71.2942123413086, + "logps/rejected": -186.11898803710938, + "loss": 0.0058, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.407034158706665, + "rewards/margins": 10.300168991088867, + "rewards/rejected": -11.707202911376953, + "step": 1208 + }, + { + "epoch": 1.84, + "learning_rate": 2.001634314120726e-08, + "logits/chosen": -1.4119137525558472, + "logits/rejected": -1.307218074798584, + "logps/chosen": -72.17942810058594, + "logps/rejected": -205.65863037109375, + "loss": 0.0048, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2082171440124512, + "rewards/margins": 12.206661224365234, + "rewards/rejected": -13.414878845214844, + "step": 1209 + }, + { + "epoch": 1.84, + "learning_rate": 1.964641246077303e-08, + "logits/chosen": -0.993287980556488, + "logits/rejected": -1.031794786453247, + "logps/chosen": -78.95590209960938, + "logps/rejected": -179.0350341796875, + "loss": 0.0055, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.017829179763794, + "rewards/margins": 9.196547508239746, + "rewards/rejected": -11.214376449584961, + "step": 1210 + }, + { + "epoch": 1.84, + "learning_rate": 1.9279863652014838e-08, + "logits/chosen": -1.060457468032837, + "logits/rejected": -1.096437692642212, + "logps/chosen": -122.54818725585938, + "logps/rejected": -243.62075805664062, + "loss": 0.0424, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.8781747817993164, + "rewards/margins": 12.498019218444824, + "rewards/rejected": -16.37619400024414, + "step": 1211 + }, + { + "epoch": 1.84, + "learning_rate": 1.8916699295575323e-08, + "logits/chosen": -1.059748649597168, + "logits/rejected": -1.059941053390503, + "logps/chosen": -54.84103775024414, + "logps/rejected": -156.09188842773438, + "loss": 0.0152, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2279458045959473, + "rewards/margins": 8.983017921447754, + "rewards/rejected": -10.21096420288086, + "step": 1212 + }, + { + "epoch": 1.84, + "learning_rate": 1.8556921948269577e-08, + "logits/chosen": -1.0731769800186157, + "logits/rejected": -0.9469415545463562, + "logps/chosen": -90.43501281738281, + "logps/rejected": -218.74835205078125, + "loss": 0.0062, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4426980018615723, + "rewards/margins": 11.901433944702148, + "rewards/rejected": -14.344131469726562, + "step": 1213 + }, + { + "epoch": 1.84, + "learning_rate": 1.820053414306677e-08, + "logits/chosen": -0.8140149116516113, + "logits/rejected": -0.7933439016342163, + "logps/chosen": -87.42171478271484, + "logps/rejected": -178.2332305908203, + "loss": 0.0122, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8189470767974854, + "rewards/margins": 8.89945125579834, + "rewards/rejected": -11.718399047851562, + "step": 1214 + }, + { + "epoch": 1.85, + "learning_rate": 1.7847538389072435e-08, + "logits/chosen": -1.2803846597671509, + "logits/rejected": -1.211902379989624, + "logps/chosen": -79.93923950195312, + "logps/rejected": -209.45614624023438, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6933627128601074, + "rewards/margins": 11.596197128295898, + "rewards/rejected": -13.289560317993164, + "step": 1215 + }, + { + "epoch": 1.85, + "learning_rate": 1.7497937171510545e-08, + "logits/chosen": -1.1593124866485596, + "logits/rejected": -1.0734546184539795, + "logps/chosen": -98.91121673583984, + "logps/rejected": -225.10154724121094, + "loss": 0.0064, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.9870173931121826, + "rewards/margins": 11.160821914672852, + "rewards/rejected": -14.147839546203613, + "step": 1216 + }, + { + "epoch": 1.85, + "learning_rate": 1.715173295170669e-08, + "logits/chosen": -1.214421272277832, + "logits/rejected": -1.1262717247009277, + "logps/chosen": -64.54765319824219, + "logps/rejected": -186.80062866210938, + "loss": 0.0074, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9676066040992737, + "rewards/margins": 11.020545959472656, + "rewards/rejected": -11.988153457641602, + "step": 1217 + }, + { + "epoch": 1.85, + "learning_rate": 1.6808928167069803e-08, + "logits/chosen": -1.2606310844421387, + "logits/rejected": -1.1958374977111816, + "logps/chosen": -61.314327239990234, + "logps/rejected": -165.88563537597656, + "loss": 0.0091, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5954817533493042, + "rewards/margins": 10.101585388183594, + "rewards/rejected": -10.697067260742188, + "step": 1218 + }, + { + "epoch": 1.85, + "learning_rate": 1.6469525231075977e-08, + "logits/chosen": -1.272934913635254, + "logits/rejected": -1.4015471935272217, + "logps/chosen": -94.45033264160156, + "logps/rejected": -221.44044494628906, + "loss": 0.0118, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8754397630691528, + "rewards/margins": 12.196402549743652, + "rewards/rejected": -14.071842193603516, + "step": 1219 + }, + { + "epoch": 1.85, + "learning_rate": 1.6133526533250563e-08, + "logits/chosen": -1.267065405845642, + "logits/rejected": -1.24986732006073, + "logps/chosen": -62.023494720458984, + "logps/rejected": -174.93882751464844, + "loss": 0.0032, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1595714092254639, + "rewards/margins": 10.430471420288086, + "rewards/rejected": -11.590043067932129, + "step": 1220 + }, + { + "epoch": 1.85, + "learning_rate": 1.58009344391522e-08, + "logits/chosen": -1.1788753271102905, + "logits/rejected": -1.096854567527771, + "logps/chosen": -79.68408203125, + "logps/rejected": -203.6868133544922, + "loss": 0.006, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2954856157302856, + "rewards/margins": 11.55772590637207, + "rewards/rejected": -12.85321044921875, + "step": 1221 + }, + { + "epoch": 1.86, + "learning_rate": 1.5471751290355385e-08, + "logits/chosen": -1.1234623193740845, + "logits/rejected": -1.024211049079895, + "logps/chosen": -49.120849609375, + "logps/rejected": -163.56591796875, + "loss": 0.0161, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4070178270339966, + "rewards/margins": 10.147906303405762, + "rewards/rejected": -10.554924011230469, + "step": 1222 + }, + { + "epoch": 1.86, + "learning_rate": 1.5145979404434517e-08, + "logits/chosen": -1.3019061088562012, + "logits/rejected": -1.3134589195251465, + "logps/chosen": -94.47854614257812, + "logps/rejected": -204.00048828125, + "loss": 0.0121, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.767600417137146, + "rewards/margins": 11.218263626098633, + "rewards/rejected": -12.985862731933594, + "step": 1223 + }, + { + "epoch": 1.86, + "learning_rate": 1.4823621074947501e-08, + "logits/chosen": -1.1317193508148193, + "logits/rejected": -1.2033860683441162, + "logps/chosen": -64.93047332763672, + "logps/rejected": -133.66629028320312, + "loss": 0.0067, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1771470308303833, + "rewards/margins": 7.702424049377441, + "rewards/rejected": -8.879571914672852, + "step": 1224 + }, + { + "epoch": 1.86, + "learning_rate": 1.4504678571419237e-08, + "logits/chosen": -1.3028910160064697, + "logits/rejected": -1.1780298948287964, + "logps/chosen": -78.07183837890625, + "logps/rejected": -232.07427978515625, + "loss": 0.0082, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.245678424835205, + "rewards/margins": 13.306718826293945, + "rewards/rejected": -14.552396774291992, + "step": 1225 + }, + { + "epoch": 1.86, + "learning_rate": 1.4189154139326143e-08, + "logits/chosen": -1.081373691558838, + "logits/rejected": -1.0935312509536743, + "logps/chosen": -79.95915222167969, + "logps/rejected": -178.8350067138672, + "loss": 0.0181, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7669938802719116, + "rewards/margins": 9.89219856262207, + "rewards/rejected": -11.659192085266113, + "step": 1226 + }, + { + "epoch": 1.86, + "learning_rate": 1.3877050000080104e-08, + "logits/chosen": -1.1050331592559814, + "logits/rejected": -1.1011483669281006, + "logps/chosen": -65.14444732666016, + "logps/rejected": -165.34768676757812, + "loss": 0.0421, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5117905139923096, + "rewards/margins": 9.55897331237793, + "rewards/rejected": -11.070762634277344, + "step": 1227 + }, + { + "epoch": 1.87, + "learning_rate": 1.3568368351012716e-08, + "logits/chosen": -1.2710386514663696, + "logits/rejected": -1.2004417181015015, + "logps/chosen": -72.86195373535156, + "logps/rejected": -189.37420654296875, + "loss": 0.0134, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4070632457733154, + "rewards/margins": 9.87858772277832, + "rewards/rejected": -11.285650253295898, + "step": 1228 + }, + { + "epoch": 1.87, + "learning_rate": 1.3263111365360014e-08, + "logits/chosen": -1.1257786750793457, + "logits/rejected": -0.9959360361099243, + "logps/chosen": -74.35279846191406, + "logps/rejected": -210.28366088867188, + "loss": 0.0473, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.981217384338379, + "rewards/margins": 11.774518966674805, + "rewards/rejected": -13.7557373046875, + "step": 1229 + }, + { + "epoch": 1.87, + "learning_rate": 1.2961281192247097e-08, + "logits/chosen": -1.075447678565979, + "logits/rejected": -1.0008114576339722, + "logps/chosen": -87.16339111328125, + "logps/rejected": -248.26565551757812, + "loss": 0.0171, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7712650299072266, + "rewards/margins": 14.019671440124512, + "rewards/rejected": -15.790936470031738, + "step": 1230 + }, + { + "epoch": 1.87, + "learning_rate": 1.2662879956673089e-08, + "logits/chosen": -1.1833305358886719, + "logits/rejected": -1.167616605758667, + "logps/chosen": -70.53716278076172, + "logps/rejected": -180.77481079101562, + "loss": 0.0068, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8839368224143982, + "rewards/margins": 10.646862030029297, + "rewards/rejected": -11.530799865722656, + "step": 1231 + }, + { + "epoch": 1.87, + "learning_rate": 1.236790975949592e-08, + "logits/chosen": -1.132072925567627, + "logits/rejected": -1.056206226348877, + "logps/chosen": -59.7764778137207, + "logps/rejected": -148.61737060546875, + "loss": 0.0152, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8665260672569275, + "rewards/margins": 7.744040489196777, + "rewards/rejected": -8.610567092895508, + "step": 1232 + }, + { + "epoch": 1.87, + "learning_rate": 1.2076372677417734e-08, + "logits/chosen": -1.2615550756454468, + "logits/rejected": -1.288482904434204, + "logps/chosen": -63.36968994140625, + "logps/rejected": -149.8604278564453, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.965242862701416, + "rewards/margins": 8.132116317749023, + "rewards/rejected": -10.097360610961914, + "step": 1233 + }, + { + "epoch": 1.87, + "learning_rate": 1.1788270762970565e-08, + "logits/chosen": -1.0883687734603882, + "logits/rejected": -1.0756895542144775, + "logps/chosen": -84.82898712158203, + "logps/rejected": -194.68531799316406, + "loss": 0.0052, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6581790447235107, + "rewards/margins": 10.328180313110352, + "rewards/rejected": -12.986358642578125, + "step": 1234 + }, + { + "epoch": 1.88, + "learning_rate": 1.1503606044500957e-08, + "logits/chosen": -1.0876470804214478, + "logits/rejected": -1.0359283685684204, + "logps/chosen": -93.50402069091797, + "logps/rejected": -222.7373809814453, + "loss": 0.0069, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1938717365264893, + "rewards/margins": 12.126307487487793, + "rewards/rejected": -14.320178031921387, + "step": 1235 + }, + { + "epoch": 1.88, + "learning_rate": 1.1222380526156927e-08, + "logits/chosen": -1.1725844144821167, + "logits/rejected": -1.0043030977249146, + "logps/chosen": -95.12078094482422, + "logps/rejected": -236.8339385986328, + "loss": 0.0138, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.277890205383301, + "rewards/margins": 11.719527244567871, + "rewards/rejected": -14.997417449951172, + "step": 1236 + }, + { + "epoch": 1.88, + "learning_rate": 1.0944596187872745e-08, + "logits/chosen": -1.194044589996338, + "logits/rejected": -1.20572030544281, + "logps/chosen": -68.72064208984375, + "logps/rejected": -159.9511260986328, + "loss": 0.0152, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.520249843597412, + "rewards/margins": 8.89706802368164, + "rewards/rejected": -10.417318344116211, + "step": 1237 + }, + { + "epoch": 1.88, + "learning_rate": 1.0670254985355731e-08, + "logits/chosen": -1.1252317428588867, + "logits/rejected": -0.9940766096115112, + "logps/chosen": -79.25250244140625, + "logps/rejected": -223.510498046875, + "loss": 0.0051, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9784678220748901, + "rewards/margins": 12.678990364074707, + "rewards/rejected": -14.65745735168457, + "step": 1238 + }, + { + "epoch": 1.88, + "learning_rate": 1.0399358850072038e-08, + "logits/chosen": -1.0572007894515991, + "logits/rejected": -0.9553701281547546, + "logps/chosen": -84.38623809814453, + "logps/rejected": -227.75350952148438, + "loss": 0.0252, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6427552700042725, + "rewards/margins": 12.430475234985352, + "rewards/rejected": -15.073230743408203, + "step": 1239 + }, + { + "epoch": 1.88, + "learning_rate": 1.0131909689233442e-08, + "logits/chosen": -1.1975586414337158, + "logits/rejected": -1.1328387260437012, + "logps/chosen": -76.48176574707031, + "logps/rejected": -194.64166259765625, + "loss": 0.01, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5488111972808838, + "rewards/margins": 10.741307258605957, + "rewards/rejected": -12.290119171142578, + "step": 1240 + }, + { + "epoch": 1.89, + "learning_rate": 9.86790938578358e-09, + "logits/chosen": -1.2658780813217163, + "logits/rejected": -1.2129406929016113, + "logps/chosen": -70.70858001708984, + "logps/rejected": -150.29287719726562, + "loss": 0.0194, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.334958553314209, + "rewards/margins": 7.42769718170166, + "rewards/rejected": -9.762656211853027, + "step": 1241 + }, + { + "epoch": 1.89, + "learning_rate": 9.607359798384784e-09, + "logits/chosen": -1.0291016101837158, + "logits/rejected": -0.9019633531570435, + "logps/chosen": -71.52269744873047, + "logps/rejected": -220.78086853027344, + "loss": 0.0087, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1930785179138184, + "rewards/margins": 12.374126434326172, + "rewards/rejected": -13.567205429077148, + "step": 1242 + }, + { + "epoch": 1.89, + "learning_rate": 9.35026276140516e-09, + "logits/chosen": -1.429419755935669, + "logits/rejected": -1.4611481428146362, + "logps/chosen": -75.60625457763672, + "logps/rejected": -172.8060760498047, + "loss": 0.0051, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9017477035522461, + "rewards/margins": 9.8096284866333, + "rewards/rejected": -10.711376190185547, + "step": 1243 + }, + { + "epoch": 1.89, + "learning_rate": 9.096620084905471e-09, + "logits/chosen": -1.2283917665481567, + "logits/rejected": -1.1737825870513916, + "logps/chosen": -65.57634735107422, + "logps/rejected": -171.305419921875, + "loss": 0.0071, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3293113708496094, + "rewards/margins": 9.985025405883789, + "rewards/rejected": -11.314336776733398, + "step": 1244 + }, + { + "epoch": 1.89, + "learning_rate": 8.846433554626443e-09, + "logits/chosen": -1.1852015256881714, + "logits/rejected": -1.154024600982666, + "logps/chosen": -94.73068237304688, + "logps/rejected": -223.88473510742188, + "loss": 0.0217, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.0769922733306885, + "rewards/margins": 12.202123641967773, + "rewards/rejected": -15.279114723205566, + "step": 1245 + }, + { + "epoch": 1.89, + "learning_rate": 8.59970493197626e-09, + "logits/chosen": -1.3885767459869385, + "logits/rejected": -1.3077125549316406, + "logps/chosen": -95.191162109375, + "logps/rejected": -229.9364013671875, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3996024131774902, + "rewards/margins": 12.123512268066406, + "rewards/rejected": -13.523114204406738, + "step": 1246 + }, + { + "epoch": 1.89, + "learning_rate": 8.356435954018193e-09, + "logits/chosen": -1.3719979524612427, + "logits/rejected": -1.3080120086669922, + "logps/chosen": -69.03075408935547, + "logps/rejected": -218.9082489013672, + "loss": 0.0044, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.759949803352356, + "rewards/margins": 13.311388969421387, + "rewards/rejected": -14.071337699890137, + "step": 1247 + }, + { + "epoch": 1.9, + "learning_rate": 8.11662833345822e-09, + "logits/chosen": -0.9629684090614319, + "logits/rejected": -0.8659667372703552, + "logps/chosen": -73.18289184570312, + "logps/rejected": -200.80935668945312, + "loss": 0.0223, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4222288131713867, + "rewards/margins": 10.841693878173828, + "rewards/rejected": -12.263922691345215, + "step": 1248 + }, + { + "epoch": 1.9, + "learning_rate": 7.880283758633033e-09, + "logits/chosen": -1.3673051595687866, + "logits/rejected": -1.3786245584487915, + "logps/chosen": -103.65737915039062, + "logps/rejected": -298.3373718261719, + "loss": 0.0074, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8675320148468018, + "rewards/margins": 17.153118133544922, + "rewards/rejected": -19.020648956298828, + "step": 1249 + }, + { + "epoch": 1.9, + "learning_rate": 7.647403893498106e-09, + "logits/chosen": -1.2449382543563843, + "logits/rejected": -1.1799533367156982, + "logps/chosen": -80.3458023071289, + "logps/rejected": -179.2467803955078, + "loss": 0.0144, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.0060768127441406, + "rewards/margins": 9.305712699890137, + "rewards/rejected": -12.311788558959961, + "step": 1250 + }, + { + "epoch": 1.9, + "learning_rate": 7.417990377616312e-09, + "logits/chosen": -1.165990948677063, + "logits/rejected": -1.119705080986023, + "logps/chosen": -95.58226013183594, + "logps/rejected": -220.37850952148438, + "loss": 0.016, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.774306535720825, + "rewards/margins": 11.19355583190918, + "rewards/rejected": -13.967863082885742, + "step": 1251 + }, + { + "epoch": 1.9, + "learning_rate": 7.1920448261457715e-09, + "logits/chosen": -1.0052763223648071, + "logits/rejected": -0.9632813334465027, + "logps/chosen": -73.35610961914062, + "logps/rejected": -190.4112091064453, + "loss": 0.0136, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2970337867736816, + "rewards/margins": 10.754767417907715, + "rewards/rejected": -12.051801681518555, + "step": 1252 + }, + { + "epoch": 1.9, + "learning_rate": 6.9695688298290754e-09, + "logits/chosen": -1.1739317178726196, + "logits/rejected": -1.069629430770874, + "logps/chosen": -55.94324493408203, + "logps/rejected": -171.08547973632812, + "loss": 0.0156, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.967921793460846, + "rewards/margins": 10.362225532531738, + "rewards/rejected": -11.330146789550781, + "step": 1253 + }, + { + "epoch": 1.91, + "learning_rate": 6.750563954981636e-09, + "logits/chosen": -1.1875897645950317, + "logits/rejected": -1.1473288536071777, + "logps/chosen": -61.80609130859375, + "logps/rejected": -234.46588134765625, + "loss": 0.003, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11906401813030243, + "rewards/margins": 15.78310489654541, + "rewards/rejected": -15.902167320251465, + "step": 1254 + }, + { + "epoch": 1.91, + "learning_rate": 6.535031743480968e-09, + "logits/chosen": -0.9607847332954407, + "logits/rejected": -0.8286802172660828, + "logps/chosen": -81.98706817626953, + "logps/rejected": -215.73744201660156, + "loss": 0.0088, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6667954921722412, + "rewards/margins": 11.247720718383789, + "rewards/rejected": -12.91451644897461, + "step": 1255 + }, + { + "epoch": 1.91, + "learning_rate": 6.322973712755697e-09, + "logits/chosen": -1.1424051523208618, + "logits/rejected": -1.085515022277832, + "logps/chosen": -55.21360778808594, + "logps/rejected": -161.61468505859375, + "loss": 0.0143, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.26040124893188477, + "rewards/margins": 10.282100677490234, + "rewards/rejected": -10.542502403259277, + "step": 1256 + }, + { + "epoch": 1.91, + "learning_rate": 6.1143913557749615e-09, + "logits/chosen": -1.2211065292358398, + "logits/rejected": -1.0838934183120728, + "logps/chosen": -73.61480712890625, + "logps/rejected": -193.85784912109375, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3676726818084717, + "rewards/margins": 10.925106048583984, + "rewards/rejected": -12.292778968811035, + "step": 1257 + }, + { + "epoch": 1.91, + "learning_rate": 5.9092861410376945e-09, + "logits/chosen": -1.2819068431854248, + "logits/rejected": -1.2782381772994995, + "logps/chosen": -71.67346954345703, + "logps/rejected": -198.6710662841797, + "loss": 0.0074, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2299200296401978, + "rewards/margins": 11.510444641113281, + "rewards/rejected": -12.740364074707031, + "step": 1258 + }, + { + "epoch": 1.91, + "learning_rate": 5.7076595125624684e-09, + "logits/chosen": -1.0552005767822266, + "logits/rejected": -1.138450026512146, + "logps/chosen": -73.14871215820312, + "logps/rejected": -144.5664825439453, + "loss": 0.0065, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1182124614715576, + "rewards/margins": 7.121225357055664, + "rewards/rejected": -9.239437103271484, + "step": 1259 + }, + { + "epoch": 1.91, + "learning_rate": 5.509512889877332e-09, + "logits/chosen": -1.1288034915924072, + "logits/rejected": -1.0820059776306152, + "logps/chosen": -87.96611022949219, + "logps/rejected": -205.0126953125, + "loss": 0.005, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.589266300201416, + "rewards/margins": 10.964679718017578, + "rewards/rejected": -12.553946495056152, + "step": 1260 + }, + { + "epoch": 1.92, + "learning_rate": 5.314847668009881e-09, + "logits/chosen": -1.153718113899231, + "logits/rejected": -1.0785784721374512, + "logps/chosen": -76.9947280883789, + "logps/rejected": -193.70933532714844, + "loss": 0.0078, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0610992908477783, + "rewards/margins": 10.301348686218262, + "rewards/rejected": -12.362447738647461, + "step": 1261 + }, + { + "epoch": 1.92, + "learning_rate": 5.123665217477036e-09, + "logits/chosen": -1.086852788925171, + "logits/rejected": -0.9465436935424805, + "logps/chosen": -58.998016357421875, + "logps/rejected": -206.93099975585938, + "loss": 0.006, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.247649073600769, + "rewards/margins": 12.062419891357422, + "rewards/rejected": -13.310070037841797, + "step": 1262 + }, + { + "epoch": 1.92, + "learning_rate": 4.935966884275944e-09, + "logits/chosen": -0.9989705681800842, + "logits/rejected": -1.0006415843963623, + "logps/chosen": -40.644771575927734, + "logps/rejected": -139.12960815429688, + "loss": 0.0084, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12693235278129578, + "rewards/margins": 9.171871185302734, + "rewards/rejected": -9.29880428314209, + "step": 1263 + }, + { + "epoch": 1.92, + "learning_rate": 4.751753989874152e-09, + "logits/chosen": -1.135905146598816, + "logits/rejected": -1.105804681777954, + "logps/chosen": -83.68693542480469, + "logps/rejected": -197.93209838867188, + "loss": 0.042, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8465352058410645, + "rewards/margins": 10.878917694091797, + "rewards/rejected": -12.725454330444336, + "step": 1264 + }, + { + "epoch": 1.92, + "learning_rate": 4.571027831200336e-09, + "logits/chosen": -1.1909990310668945, + "logits/rejected": -1.2340352535247803, + "logps/chosen": -69.18940734863281, + "logps/rejected": -165.2108917236328, + "loss": 0.0303, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2262462377548218, + "rewards/margins": 9.280619621276855, + "rewards/rejected": -10.506864547729492, + "step": 1265 + }, + { + "epoch": 1.92, + "learning_rate": 4.393789680635307e-09, + "logits/chosen": -1.2873908281326294, + "logits/rejected": -1.2333879470825195, + "logps/chosen": -108.88398742675781, + "logps/rejected": -268.0072326660156, + "loss": 0.0239, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.34629225730896, + "rewards/margins": 13.214006423950195, + "rewards/rejected": -16.560298919677734, + "step": 1266 + }, + { + "epoch": 1.92, + "learning_rate": 4.2200407860029655e-09, + "logits/chosen": -1.0964261293411255, + "logits/rejected": -0.9515805244445801, + "logps/chosen": -83.07080078125, + "logps/rejected": -202.83042907714844, + "loss": 0.0078, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1373143196105957, + "rewards/margins": 10.945977210998535, + "rewards/rejected": -13.083292007446289, + "step": 1267 + }, + { + "epoch": 1.93, + "learning_rate": 4.049782370561583e-09, + "logits/chosen": -1.051609992980957, + "logits/rejected": -0.8953971266746521, + "logps/chosen": -67.84369659423828, + "logps/rejected": -204.5872039794922, + "loss": 0.0154, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2447519302368164, + "rewards/margins": 11.80958366394043, + "rewards/rejected": -13.054337501525879, + "step": 1268 + }, + { + "epoch": 1.93, + "learning_rate": 3.8830156329949235e-09, + "logits/chosen": -0.9912279844284058, + "logits/rejected": -0.9690757393836975, + "logps/chosen": -71.34811401367188, + "logps/rejected": -150.02572631835938, + "loss": 0.0321, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.4374864101409912, + "rewards/margins": 8.00107192993164, + "rewards/rejected": -9.438558578491211, + "step": 1269 + }, + { + "epoch": 1.93, + "learning_rate": 3.719741747404248e-09, + "logits/chosen": -1.1927863359451294, + "logits/rejected": -1.2303924560546875, + "logps/chosen": -59.63166809082031, + "logps/rejected": -160.16390991210938, + "loss": 0.0212, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2459349632263184, + "rewards/margins": 9.462393760681152, + "rewards/rejected": -10.708330154418945, + "step": 1270 + }, + { + "epoch": 1.93, + "learning_rate": 3.5599618632997097e-09, + "logits/chosen": -1.103395938873291, + "logits/rejected": -1.0654138326644897, + "logps/chosen": -76.19361114501953, + "logps/rejected": -192.7483367919922, + "loss": 0.007, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0147182941436768, + "rewards/margins": 11.459994316101074, + "rewards/rejected": -12.474712371826172, + "step": 1271 + }, + { + "epoch": 1.93, + "learning_rate": 3.403677105592306e-09, + "logits/chosen": -1.1428433656692505, + "logits/rejected": -1.1347157955169678, + "logps/chosen": -62.77545166015625, + "logps/rejected": -167.58773803710938, + "loss": 0.0064, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.135177731513977, + "rewards/margins": 10.042524337768555, + "rewards/rejected": -11.177702903747559, + "step": 1272 + }, + { + "epoch": 1.93, + "learning_rate": 3.2508885745861637e-09, + "logits/chosen": -1.2463092803955078, + "logits/rejected": -1.1767163276672363, + "logps/chosen": -87.07160949707031, + "logps/rejected": -209.8955078125, + "loss": 0.0137, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.458374500274658, + "rewards/margins": 10.535346984863281, + "rewards/rejected": -12.993721961975098, + "step": 1273 + }, + { + "epoch": 1.94, + "learning_rate": 3.1015973459704857e-09, + "logits/chosen": -1.2347352504730225, + "logits/rejected": -1.1266793012619019, + "logps/chosen": -109.47550964355469, + "logps/rejected": -266.93524169921875, + "loss": 0.0089, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.782963752746582, + "rewards/margins": 13.678717613220215, + "rewards/rejected": -17.461681365966797, + "step": 1274 + }, + { + "epoch": 1.94, + "learning_rate": 2.9558044708123397e-09, + "logits/chosen": -0.8758256435394287, + "logits/rejected": -0.75980544090271, + "logps/chosen": -43.183082580566406, + "logps/rejected": -126.92056274414062, + "loss": 0.0064, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.26970601081848145, + "rewards/margins": 7.577888488769531, + "rewards/rejected": -7.847594738006592, + "step": 1275 + }, + { + "epoch": 1.94, + "learning_rate": 2.813510975548772e-09, + "logits/chosen": -1.201333999633789, + "logits/rejected": -1.1146811246871948, + "logps/chosen": -92.7200698852539, + "logps/rejected": -198.17112731933594, + "loss": 0.0117, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.748230457305908, + "rewards/margins": 9.353466033935547, + "rewards/rejected": -12.101696968078613, + "step": 1276 + }, + { + "epoch": 1.94, + "learning_rate": 2.6747178619800913e-09, + "logits/chosen": -1.3739142417907715, + "logits/rejected": -1.3986132144927979, + "logps/chosen": -59.81341552734375, + "logps/rejected": -179.90093994140625, + "loss": 0.0083, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.376999855041504, + "rewards/margins": 10.636183738708496, + "rewards/rejected": -12.01318359375, + "step": 1277 + }, + { + "epoch": 1.94, + "learning_rate": 2.539426107262599e-09, + "logits/chosen": -1.2071001529693604, + "logits/rejected": -1.1412181854248047, + "logps/chosen": -110.31078338623047, + "logps/rejected": -285.4696044921875, + "loss": 0.0048, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4827301502227783, + "rewards/margins": 16.43672752380371, + "rewards/rejected": -18.919456481933594, + "step": 1278 + }, + { + "epoch": 1.94, + "learning_rate": 2.407636663901591e-09, + "logits/chosen": -1.0972704887390137, + "logits/rejected": -0.9204725623130798, + "logps/chosen": -72.80489349365234, + "logps/rejected": -211.97796630859375, + "loss": 0.0136, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.093588948249817, + "rewards/margins": 12.024868965148926, + "rewards/rejected": -13.11845874786377, + "step": 1279 + }, + { + "epoch": 1.94, + "learning_rate": 2.2793504597446998e-09, + "logits/chosen": -1.2425183057785034, + "logits/rejected": -1.156579613685608, + "logps/chosen": -91.27490234375, + "logps/rejected": -224.80088806152344, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0254929065704346, + "rewards/margins": 12.998912811279297, + "rewards/rejected": -15.024404525756836, + "step": 1280 + }, + { + "epoch": 1.95, + "learning_rate": 2.15456839797562e-09, + "logits/chosen": -1.0614230632781982, + "logits/rejected": -0.9386066794395447, + "logps/chosen": -96.80696105957031, + "logps/rejected": -256.17938232421875, + "loss": 0.0144, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.9327540397644043, + "rewards/margins": 13.764790534973145, + "rewards/rejected": -16.69754409790039, + "step": 1281 + }, + { + "epoch": 1.95, + "learning_rate": 2.0332913571074474e-09, + "logits/chosen": -1.1535450220108032, + "logits/rejected": -1.2190803289413452, + "logps/chosen": -85.00349426269531, + "logps/rejected": -179.6030731201172, + "loss": 0.0466, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.5289652347564697, + "rewards/margins": 9.103918075561523, + "rewards/rejected": -11.63288402557373, + "step": 1282 + }, + { + "epoch": 1.95, + "learning_rate": 1.9155201909765717e-09, + "logits/chosen": -0.8580641150474548, + "logits/rejected": -0.7709234952926636, + "logps/chosen": -69.90385437011719, + "logps/rejected": -167.78280639648438, + "loss": 0.0097, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7532331943511963, + "rewards/margins": 9.278640747070312, + "rewards/rejected": -11.031872749328613, + "step": 1283 + }, + { + "epoch": 1.95, + "learning_rate": 1.8012557287367391e-09, + "logits/chosen": -1.0224920511245728, + "logits/rejected": -1.008731484413147, + "logps/chosen": -79.84169006347656, + "logps/rejected": -211.30001831054688, + "loss": 0.008, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5303840637207031, + "rewards/margins": 12.468156814575195, + "rewards/rejected": -13.998542785644531, + "step": 1284 + }, + { + "epoch": 1.95, + "learning_rate": 1.6904987748532217e-09, + "logits/chosen": -1.0827257633209229, + "logits/rejected": -0.9155064225196838, + "logps/chosen": -97.63224029541016, + "logps/rejected": -269.2901306152344, + "loss": 0.0331, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.41310453414917, + "rewards/margins": 15.162589073181152, + "rewards/rejected": -17.575693130493164, + "step": 1285 + }, + { + "epoch": 1.95, + "learning_rate": 1.5832501090968786e-09, + "logits/chosen": -0.9661201238632202, + "logits/rejected": -0.7906744480133057, + "logps/chosen": -66.0601806640625, + "logps/rejected": -214.29466247558594, + "loss": 0.005, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5611439943313599, + "rewards/margins": 12.344537734985352, + "rewards/rejected": -13.905682563781738, + "step": 1286 + }, + { + "epoch": 1.96, + "learning_rate": 1.479510486539215e-09, + "logits/chosen": -0.9654014110565186, + "logits/rejected": -0.713676929473877, + "logps/chosen": -72.038818359375, + "logps/rejected": -260.5837097167969, + "loss": 0.0104, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1215585470199585, + "rewards/margins": 14.717130661010742, + "rewards/rejected": -15.838688850402832, + "step": 1287 + }, + { + "epoch": 1.96, + "learning_rate": 1.3792806375464427e-09, + "logits/chosen": -1.288902759552002, + "logits/rejected": -1.244452953338623, + "logps/chosen": -76.58100891113281, + "logps/rejected": -184.2479248046875, + "loss": 0.0112, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9578903913497925, + "rewards/margins": 9.693171501159668, + "rewards/rejected": -11.65106201171875, + "step": 1288 + }, + { + "epoch": 1.96, + "learning_rate": 1.2825612677748732e-09, + "logits/chosen": -1.3379647731781006, + "logits/rejected": -1.2519395351409912, + "logps/chosen": -67.4783706665039, + "logps/rejected": -164.44052124023438, + "loss": 0.0037, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.609692394733429, + "rewards/margins": 9.941685676574707, + "rewards/rejected": -10.55137825012207, + "step": 1289 + }, + { + "epoch": 1.96, + "learning_rate": 1.1893530581654766e-09, + "logits/chosen": -1.2620232105255127, + "logits/rejected": -1.195061445236206, + "logps/chosen": -78.19368743896484, + "logps/rejected": -195.7747344970703, + "loss": 0.0094, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8783726692199707, + "rewards/margins": 9.804338455200195, + "rewards/rejected": -11.682710647583008, + "step": 1290 + }, + { + "epoch": 1.96, + "learning_rate": 1.0996566649395522e-09, + "logits/chosen": -1.1842918395996094, + "logits/rejected": -1.0768086910247803, + "logps/chosen": -81.30087280273438, + "logps/rejected": -228.88381958007812, + "loss": 0.0119, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9906009435653687, + "rewards/margins": 12.947613716125488, + "rewards/rejected": -14.938216209411621, + "step": 1291 + }, + { + "epoch": 1.96, + "learning_rate": 1.013472719593733e-09, + "logits/chosen": -1.0303940773010254, + "logits/rejected": -0.9588882923126221, + "logps/chosen": -61.77070617675781, + "logps/rejected": -165.2044677734375, + "loss": 0.011, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1046134233474731, + "rewards/margins": 9.36551570892334, + "rewards/rejected": -10.470128059387207, + "step": 1292 + }, + { + "epoch": 1.96, + "learning_rate": 9.308018288957109e-10, + "logits/chosen": -1.2706711292266846, + "logits/rejected": -1.1618214845657349, + "logps/chosen": -98.66361999511719, + "logps/rejected": -245.0448760986328, + "loss": 0.0275, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.179671287536621, + "rewards/margins": 12.922852516174316, + "rewards/rejected": -15.102523803710938, + "step": 1293 + }, + { + "epoch": 1.97, + "learning_rate": 8.516445748800172e-10, + "logits/chosen": -1.0659732818603516, + "logits/rejected": -0.9968633651733398, + "logps/chosen": -99.51051330566406, + "logps/rejected": -270.474853515625, + "loss": 0.022, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3661701679229736, + "rewards/margins": 14.43785285949707, + "rewards/rejected": -16.80402374267578, + "step": 1294 + }, + { + "epoch": 1.97, + "learning_rate": 7.760015148436938e-10, + "logits/chosen": -1.1200745105743408, + "logits/rejected": -1.0644164085388184, + "logps/chosen": -74.07913208007812, + "logps/rejected": -169.44464111328125, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.399211883544922, + "rewards/margins": 8.491737365722656, + "rewards/rejected": -10.890949249267578, + "step": 1295 + }, + { + "epoch": 1.97, + "learning_rate": 7.038731813426291e-10, + "logits/chosen": -0.9871138334274292, + "logits/rejected": -0.9330585598945618, + "logps/chosen": -66.69938659667969, + "logps/rejected": -158.9303436279297, + "loss": 0.0484, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.036125659942627, + "rewards/margins": 8.828483581542969, + "rewards/rejected": -9.864608764648438, + "step": 1296 + }, + { + "epoch": 1.97, + "learning_rate": 6.352600821876719e-10, + "logits/chosen": -1.2522269487380981, + "logits/rejected": -1.1094790697097778, + "logps/chosen": -101.69255065917969, + "logps/rejected": -275.8647766113281, + "loss": 0.0189, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7666094303131104, + "rewards/margins": 15.001011848449707, + "rewards/rejected": -17.767621994018555, + "step": 1297 + }, + { + "epoch": 1.97, + "learning_rate": 5.701627004411347e-10, + "logits/chosen": -0.8893463611602783, + "logits/rejected": -0.7074088454246521, + "logps/chosen": -83.16688537597656, + "logps/rejected": -230.15130615234375, + "loss": 0.0058, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.377328872680664, + "rewards/margins": 12.509635925292969, + "rewards/rejected": -14.886964797973633, + "step": 1298 + }, + { + "epoch": 1.97, + "learning_rate": 5.08581494413296e-10, + "logits/chosen": -1.224255084991455, + "logits/rejected": -1.1592600345611572, + "logps/chosen": -103.52490234375, + "logps/rejected": -236.6667022705078, + "loss": 0.0042, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.5443618297576904, + "rewards/margins": 12.799535751342773, + "rewards/rejected": -15.34389591217041, + "step": 1299 + }, + { + "epoch": 1.97, + "learning_rate": 4.5051689765929213e-10, + "logits/chosen": -1.092574119567871, + "logits/rejected": -0.9000959396362305, + "logps/chosen": -79.19598388671875, + "logps/rejected": -217.1548309326172, + "loss": 0.0105, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9727332592010498, + "rewards/margins": 12.164961814880371, + "rewards/rejected": -14.1376953125, + "step": 1300 + }, + { + "epoch": 1.98, + "learning_rate": 3.959693189757862e-10, + "logits/chosen": -1.3492411375045776, + "logits/rejected": -1.288551926612854, + "logps/chosen": -71.91275024414062, + "logps/rejected": -187.95205688476562, + "loss": 0.0064, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8926293849945068, + "rewards/margins": 11.140167236328125, + "rewards/rejected": -12.032794952392578, + "step": 1301 + }, + { + "epoch": 1.98, + "learning_rate": 3.4493914239858146e-10, + "logits/chosen": -1.2096384763717651, + "logits/rejected": -1.1264015436172485, + "logps/chosen": -84.34400939941406, + "logps/rejected": -215.83203125, + "loss": 0.018, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3935375213623047, + "rewards/margins": 11.227790832519531, + "rewards/rejected": -13.621328353881836, + "step": 1302 + }, + { + "epoch": 1.98, + "learning_rate": 2.974267271994013e-10, + "logits/chosen": -1.2268598079681396, + "logits/rejected": -1.1861000061035156, + "logps/chosen": -81.8856430053711, + "logps/rejected": -210.980712890625, + "loss": 0.0058, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.220350980758667, + "rewards/margins": 12.413765907287598, + "rewards/rejected": -13.63411808013916, + "step": 1303 + }, + { + "epoch": 1.98, + "learning_rate": 2.5343240788378016e-10, + "logits/chosen": -1.2381658554077148, + "logits/rejected": -1.0283713340759277, + "logps/chosen": -114.27998352050781, + "logps/rejected": -293.5376892089844, + "loss": 0.0054, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.651200771331787, + "rewards/margins": 15.330041885375977, + "rewards/rejected": -18.98124122619629, + "step": 1304 + }, + { + "epoch": 1.98, + "learning_rate": 2.1295649418839877e-10, + "logits/chosen": -1.2112208604812622, + "logits/rejected": -1.0418407917022705, + "logps/chosen": -97.28136444091797, + "logps/rejected": -230.7408447265625, + "loss": 0.0086, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.5646955966949463, + "rewards/margins": 12.14634895324707, + "rewards/rejected": -14.711043357849121, + "step": 1305 + }, + { + "epoch": 1.98, + "learning_rate": 1.7599927107908585e-10, + "logits/chosen": -1.0789711475372314, + "logits/rejected": -0.9465877413749695, + "logps/chosen": -71.02899932861328, + "logps/rejected": -193.57167053222656, + "loss": 0.0054, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4242923259735107, + "rewards/margins": 10.871113777160645, + "rewards/rejected": -12.295406341552734, + "step": 1306 + }, + { + "epoch": 1.99, + "learning_rate": 1.4256099874865312e-10, + "logits/chosen": -1.2479493618011475, + "logits/rejected": -1.1449650526046753, + "logps/chosen": -92.4945297241211, + "logps/rejected": -238.48211669921875, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.360962390899658, + "rewards/margins": 12.930143356323242, + "rewards/rejected": -15.291106224060059, + "step": 1307 + }, + { + "epoch": 1.99, + "learning_rate": 1.1264191261528555e-10, + "logits/chosen": -0.8584355115890503, + "logits/rejected": -0.8667644262313843, + "logps/chosen": -54.118743896484375, + "logps/rejected": -140.81704711914062, + "loss": 0.0138, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6013064384460449, + "rewards/margins": 8.267008781433105, + "rewards/rejected": -8.868314743041992, + "step": 1308 + }, + { + "epoch": 1.99, + "learning_rate": 8.62422233207094e-11, + "logits/chosen": -1.1232349872589111, + "logits/rejected": -0.9467052221298218, + "logps/chosen": -80.04724884033203, + "logps/rejected": -226.53021240234375, + "loss": 0.0289, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.100845217704773, + "rewards/margins": 12.923338890075684, + "rewards/rejected": -14.024184226989746, + "step": 1309 + }, + { + "epoch": 1.99, + "learning_rate": 6.336211672880453e-11, + "logits/chosen": -1.2444984912872314, + "logits/rejected": -1.2211215496063232, + "logps/chosen": -63.22529983520508, + "logps/rejected": -168.42591857910156, + "loss": 0.0062, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8757107257843018, + "rewards/margins": 9.665054321289062, + "rewards/rejected": -10.540765762329102, + "step": 1310 + }, + { + "epoch": 1.99, + "learning_rate": 4.4001753924105547e-11, + "logits/chosen": -1.2597283124923706, + "logits/rejected": -1.2255922555923462, + "logps/chosen": -90.53179931640625, + "logps/rejected": -219.01829528808594, + "loss": 0.0168, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.364285707473755, + "rewards/margins": 10.397770881652832, + "rewards/rejected": -13.762057304382324, + "step": 1311 + }, + { + "epoch": 1.99, + "learning_rate": 2.8161271211024628e-11, + "logits/chosen": -1.2949028015136719, + "logits/rejected": -1.2165987491607666, + "logps/chosen": -69.34394836425781, + "logps/rejected": -204.77670288085938, + "loss": 0.0151, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8771134614944458, + "rewards/margins": 11.10797119140625, + "rewards/rejected": -12.985086441040039, + "step": 1312 + }, + { + "epoch": 1.99, + "learning_rate": 1.5840780112519325e-11, + "logits/chosen": -1.2168735265731812, + "logits/rejected": -1.1569671630859375, + "logps/chosen": -73.16233825683594, + "logps/rejected": -197.2447052001953, + "loss": 0.0106, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.596511960029602, + "rewards/margins": 11.405481338500977, + "rewards/rejected": -13.001994132995605, + "step": 1313 + }, + { + "epoch": 2.0, + "learning_rate": 7.0403673695373925e-12, + "logits/chosen": -1.0676594972610474, + "logits/rejected": -1.0000176429748535, + "logps/chosen": -55.793033599853516, + "logps/rejected": -137.31324768066406, + "loss": 0.0108, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1835592985153198, + "rewards/margins": 7.81547212600708, + "rewards/rejected": -8.999032020568848, + "step": 1314 + }, + { + "epoch": 2.0, + "learning_rate": 1.7600949402951647e-12, + "logits/chosen": -1.2632269859313965, + "logits/rejected": -1.2024056911468506, + "logps/chosen": -77.23323059082031, + "logps/rejected": -208.28042602539062, + "loss": 0.0195, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5779087543487549, + "rewards/margins": 12.250576972961426, + "rewards/rejected": -13.828487396240234, + "step": 1315 + }, + { + "epoch": 2.0, + "learning_rate": 0.0, + "logits/chosen": -0.9433375000953674, + "logits/rejected": -0.918114960193634, + "logps/chosen": -55.77727508544922, + "logps/rejected": -143.7599334716797, + "loss": 0.0165, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.816506028175354, + "rewards/margins": 8.368966102600098, + "rewards/rejected": -9.185473442077637, + "step": 1316 + }, + { + "epoch": 2.0, + "step": 1316, + "total_flos": 0.0, + "train_loss": 0.154733782048524, + "train_runtime": 54790.8843, + "train_samples_per_second": 3.076, + "train_steps_per_second": 0.024 + } + ], + "logging_steps": 1.0, + "max_steps": 1316, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 500, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}