diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,26860 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 400, + "global_step": 17412, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00017229496898690558, + "grad_norm": 2.0905284881591797, + "learning_rate": 5.74052812858783e-11, + "logits/chosen": -2.8080272674560547, + "logits/rejected": -2.785019874572754, + "logps/chosen": -44.8405876159668, + "logps/rejected": -39.36625671386719, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0017229496898690559, + "grad_norm": 2.09809947013855, + "learning_rate": 5.74052812858783e-10, + "logits/chosen": -2.9044275283813477, + "logits/rejected": -2.8818445205688477, + "logps/chosen": -51.817386627197266, + "logps/rejected": -49.23894119262695, + "loss": 0.6932, + "rewards/accuracies": 0.4513888955116272, + "rewards/chosen": -9.387239697389305e-05, + "rewards/margins": -6.934934935998172e-05, + "rewards/rejected": -2.4523074898752384e-05, + "step": 10 + }, + { + "epoch": 0.0034458993797381117, + "grad_norm": 2.099186658859253, + "learning_rate": 1.148105625717566e-09, + "logits/chosen": -2.9467902183532715, + "logits/rejected": -2.941981077194214, + "logps/chosen": -53.83275604248047, + "logps/rejected": -52.87550735473633, + "loss": 0.6932, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.00010858189489226788, + "rewards/margins": -0.00010842746996786445, + "rewards/rejected": -1.5443511358625983e-07, + "step": 20 + }, + { + "epoch": 0.005168849069607168, + "grad_norm": 2.2349655628204346, + "learning_rate": 1.7221584385763488e-09, + "logits/chosen": -2.910006046295166, + "logits/rejected": -2.891695261001587, + "logps/chosen": -57.67896270751953, + "logps/rejected": -57.83086395263672, + "loss": 0.6932, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": 4.440903921931749e-06, + "rewards/margins": -4.769151928485371e-05, + "rewards/rejected": 5.213242911850102e-05, + "step": 30 + }, + { + "epoch": 0.006891798759476223, + "grad_norm": 1.8455334901809692, + "learning_rate": 2.296211251435132e-09, + "logits/chosen": -2.9271020889282227, + "logits/rejected": -2.9034230709075928, + "logps/chosen": -56.056373596191406, + "logps/rejected": -50.16437530517578, + "loss": 0.693, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.00015602688654325902, + "rewards/margins": 0.00022045333753339946, + "rewards/rejected": -6.442649464588612e-05, + "step": 40 + }, + { + "epoch": 0.00861474844934528, + "grad_norm": 1.9800422191619873, + "learning_rate": 2.870264064293915e-09, + "logits/chosen": -2.9295787811279297, + "logits/rejected": -2.918616771697998, + "logps/chosen": -53.17473220825195, + "logps/rejected": -50.476890563964844, + "loss": 0.6931, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -6.461610610131174e-05, + "rewards/margins": -8.848131756167277e-07, + "rewards/rejected": -6.373132055159658e-05, + "step": 50 + }, + { + "epoch": 0.010337698139214336, + "grad_norm": 2.3492772579193115, + "learning_rate": 3.4443168771526976e-09, + "logits/chosen": -2.9493114948272705, + "logits/rejected": -2.926198720932007, + "logps/chosen": -58.42417526245117, + "logps/rejected": -53.90165328979492, + "loss": 0.6933, + "rewards/accuracies": 0.4437499940395355, + "rewards/chosen": -0.00026348684332333505, + "rewards/margins": -0.00033530019572936, + "rewards/rejected": 7.18133378541097e-05, + "step": 60 + }, + { + "epoch": 0.012060647829083391, + "grad_norm": 2.0364973545074463, + "learning_rate": 4.018369690011481e-09, + "logits/chosen": -2.9074623584747314, + "logits/rejected": -2.895207166671753, + "logps/chosen": -54.81498336791992, + "logps/rejected": -52.3970832824707, + "loss": 0.6931, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": 0.00012572194100357592, + "rewards/margins": 7.510065915994346e-05, + "rewards/rejected": 5.062127092969604e-05, + "step": 70 + }, + { + "epoch": 0.013783597518952447, + "grad_norm": 2.25162672996521, + "learning_rate": 4.592422502870264e-09, + "logits/chosen": -2.9643242359161377, + "logits/rejected": -2.942950487136841, + "logps/chosen": -60.2001953125, + "logps/rejected": -53.245811462402344, + "loss": 0.6932, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -2.0969393517589197e-05, + "rewards/margins": -5.4584954341407865e-05, + "rewards/rejected": 3.361555718583986e-05, + "step": 80 + }, + { + "epoch": 0.015506547208821502, + "grad_norm": 2.1411986351013184, + "learning_rate": 5.166475315729047e-09, + "logits/chosen": -2.867920160293579, + "logits/rejected": -2.8608827590942383, + "logps/chosen": -54.94426345825195, + "logps/rejected": -51.807090759277344, + "loss": 0.6932, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -0.00012062456517014652, + "rewards/margins": -4.8005091230152175e-05, + "rewards/rejected": -7.261949212988839e-05, + "step": 90 + }, + { + "epoch": 0.01722949689869056, + "grad_norm": 2.2051501274108887, + "learning_rate": 5.74052812858783e-09, + "logits/chosen": -2.967716693878174, + "logits/rejected": -2.919863224029541, + "logps/chosen": -57.4222526550293, + "logps/rejected": -48.79217529296875, + "loss": 0.6931, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": -0.00012718760990537703, + "rewards/margins": 0.00012836657697334886, + "rewards/rejected": -0.00025555412867106497, + "step": 100 + }, + { + "epoch": 0.018952446588559616, + "grad_norm": 2.246222496032715, + "learning_rate": 6.314580941446612e-09, + "logits/chosen": -2.9464218616485596, + "logits/rejected": -2.9271929264068604, + "logps/chosen": -56.66094970703125, + "logps/rejected": -51.9701042175293, + "loss": 0.6931, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": 0.00010782321623992175, + "rewards/margins": 0.00011707199882948771, + "rewards/rejected": -9.2488016889547e-06, + "step": 110 + }, + { + "epoch": 0.02067539627842867, + "grad_norm": 2.312595844268799, + "learning_rate": 6.888633754305395e-09, + "logits/chosen": -2.885158061981201, + "logits/rejected": -2.873599052429199, + "logps/chosen": -53.678932189941406, + "logps/rejected": -54.923248291015625, + "loss": 0.693, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": 0.00021055006072856486, + "rewards/margins": 0.0002522490103729069, + "rewards/rejected": -4.1698935092426836e-05, + "step": 120 + }, + { + "epoch": 0.022398345968297727, + "grad_norm": 1.9637584686279297, + "learning_rate": 7.462686567164179e-09, + "logits/chosen": -2.932377338409424, + "logits/rejected": -2.9271585941314697, + "logps/chosen": -56.641029357910156, + "logps/rejected": -53.10710525512695, + "loss": 0.6931, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -0.00017390199354849756, + "rewards/margins": 7.098014066286851e-06, + "rewards/rejected": -0.00018099998123943806, + "step": 130 + }, + { + "epoch": 0.024121295658166782, + "grad_norm": 2.3946006298065186, + "learning_rate": 8.036739380022962e-09, + "logits/chosen": -2.9414381980895996, + "logits/rejected": -2.9314544200897217, + "logps/chosen": -54.4839973449707, + "logps/rejected": -52.5967903137207, + "loss": 0.6932, + "rewards/accuracies": 0.4437499940395355, + "rewards/chosen": -9.17953802854754e-05, + "rewards/margins": -9.829508780967444e-05, + "rewards/rejected": 6.49970297672553e-06, + "step": 140 + }, + { + "epoch": 0.025844245348035838, + "grad_norm": 2.067854642868042, + "learning_rate": 8.610792192881745e-09, + "logits/chosen": -2.8870601654052734, + "logits/rejected": -2.874255657196045, + "logps/chosen": -53.06481170654297, + "logps/rejected": -51.183555603027344, + "loss": 0.6933, + "rewards/accuracies": 0.4437499940395355, + "rewards/chosen": -1.808160959626548e-05, + "rewards/margins": -0.0002152116794604808, + "rewards/rejected": 0.00019713006622623652, + "step": 150 + }, + { + "epoch": 0.027567195037904894, + "grad_norm": 1.8846144676208496, + "learning_rate": 9.184845005740529e-09, + "logits/chosen": -2.9312491416931152, + "logits/rejected": -2.9175572395324707, + "logps/chosen": -54.59100341796875, + "logps/rejected": -54.2659912109375, + "loss": 0.6931, + "rewards/accuracies": 0.5, + "rewards/chosen": 5.2008450438734144e-05, + "rewards/margins": 8.009701559785753e-05, + "rewards/rejected": -2.8088572435081005e-05, + "step": 160 + }, + { + "epoch": 0.02929014472777395, + "grad_norm": 2.0794315338134766, + "learning_rate": 9.758897818599312e-09, + "logits/chosen": -2.9155006408691406, + "logits/rejected": -2.901543617248535, + "logps/chosen": -56.4522705078125, + "logps/rejected": -50.75872802734375, + "loss": 0.6931, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 1.9222959963371977e-05, + "rewards/margins": 0.0001044358141371049, + "rewards/rejected": -8.521286508766934e-05, + "step": 170 + }, + { + "epoch": 0.031013094417643005, + "grad_norm": 2.257279872894287, + "learning_rate": 1.0332950631458094e-08, + "logits/chosen": -2.9141793251037598, + "logits/rejected": -2.898280382156372, + "logps/chosen": -57.03478240966797, + "logps/rejected": -52.420867919921875, + "loss": 0.6931, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 5.353235246730037e-05, + "rewards/margins": 9.774983482202515e-05, + "rewards/rejected": -4.4217464164830744e-05, + "step": 180 + }, + { + "epoch": 0.03273604410751206, + "grad_norm": 2.538918972015381, + "learning_rate": 1.0907003444316877e-08, + "logits/chosen": -2.9484705924987793, + "logits/rejected": -2.9152588844299316, + "logps/chosen": -59.644805908203125, + "logps/rejected": -51.64323043823242, + "loss": 0.6933, + "rewards/accuracies": 0.46875, + "rewards/chosen": -0.000212798360735178, + "rewards/margins": -0.00027094813412986696, + "rewards/rejected": 5.8149791584583e-05, + "step": 190 + }, + { + "epoch": 0.03445899379738112, + "grad_norm": 2.2350316047668457, + "learning_rate": 1.148105625717566e-08, + "logits/chosen": -2.907341241836548, + "logits/rejected": -2.897705554962158, + "logps/chosen": -54.77196502685547, + "logps/rejected": -53.700828552246094, + "loss": 0.6931, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -9.717059583636001e-05, + "rewards/margins": 5.300307020661421e-05, + "rewards/rejected": -0.00015017366968095303, + "step": 200 + }, + { + "epoch": 0.03618194348725017, + "grad_norm": 2.1636910438537598, + "learning_rate": 1.2055109070034444e-08, + "logits/chosen": -2.8660168647766113, + "logits/rejected": -2.8632164001464844, + "logps/chosen": -54.080406188964844, + "logps/rejected": -56.40966033935547, + "loss": 0.6931, + "rewards/accuracies": 0.46875, + "rewards/chosen": 8.996956603368744e-05, + "rewards/margins": 3.7124355003470555e-05, + "rewards/rejected": 5.284523285808973e-05, + "step": 210 + }, + { + "epoch": 0.03790489317711923, + "grad_norm": 2.0409598350524902, + "learning_rate": 1.2629161882893224e-08, + "logits/chosen": -2.904695749282837, + "logits/rejected": -2.8822054862976074, + "logps/chosen": -53.443138122558594, + "logps/rejected": -49.99930953979492, + "loss": 0.6933, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": -0.00015412228822242469, + "rewards/margins": -0.0002943346626125276, + "rewards/rejected": 0.0001402123598381877, + "step": 220 + }, + { + "epoch": 0.03962784286698828, + "grad_norm": 2.233654737472534, + "learning_rate": 1.3203214695752007e-08, + "logits/chosen": -2.900219202041626, + "logits/rejected": -2.8891189098358154, + "logps/chosen": -49.81378936767578, + "logps/rejected": -49.349815368652344, + "loss": 0.6932, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.00014936344814486802, + "rewards/margins": -0.00013175979256629944, + "rewards/rejected": -1.760370287229307e-05, + "step": 230 + }, + { + "epoch": 0.04135079255685734, + "grad_norm": 1.9850331544876099, + "learning_rate": 1.377726750861079e-08, + "logits/chosen": -2.8763766288757324, + "logits/rejected": -2.84673810005188, + "logps/chosen": -56.71735382080078, + "logps/rejected": -51.65503692626953, + "loss": 0.6931, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -1.7611677321838215e-05, + "rewards/margins": 0.0001161120380857028, + "rewards/rejected": -0.00013372373359743506, + "step": 240 + }, + { + "epoch": 0.043073742246726394, + "grad_norm": 2.068523406982422, + "learning_rate": 1.4351320321469574e-08, + "logits/chosen": -2.949462413787842, + "logits/rejected": -2.9321811199188232, + "logps/chosen": -53.439109802246094, + "logps/rejected": -50.085819244384766, + "loss": 0.6931, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": 3.4617838537087664e-05, + "rewards/margins": 1.9717685063369572e-05, + "rewards/rejected": 1.490015256422339e-05, + "step": 250 + }, + { + "epoch": 0.044796691936595454, + "grad_norm": 1.9957587718963623, + "learning_rate": 1.4925373134328357e-08, + "logits/chosen": -2.9308934211730957, + "logits/rejected": -2.9233505725860596, + "logps/chosen": -55.7227897644043, + "logps/rejected": -55.18525314331055, + "loss": 0.693, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.00011331225687172264, + "rewards/margins": 0.00031197606585919857, + "rewards/rejected": -0.00019866382353939116, + "step": 260 + }, + { + "epoch": 0.046519641626464506, + "grad_norm": 2.175574541091919, + "learning_rate": 1.549942594718714e-08, + "logits/chosen": -2.8968732357025146, + "logits/rejected": -2.889274835586548, + "logps/chosen": -53.792694091796875, + "logps/rejected": -53.44324493408203, + "loss": 0.6931, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.0001622785785002634, + "rewards/margins": 7.451194687746465e-05, + "rewards/rejected": -0.00023679053992964327, + "step": 270 + }, + { + "epoch": 0.048242591316333565, + "grad_norm": 2.0508830547332764, + "learning_rate": 1.6073478760045924e-08, + "logits/chosen": -2.9564032554626465, + "logits/rejected": -2.9346067905426025, + "logps/chosen": -58.7863883972168, + "logps/rejected": -52.57866287231445, + "loss": 0.6932, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -0.00020536321972031146, + "rewards/margins": -7.604634447488934e-05, + "rewards/rejected": -0.0001293168606935069, + "step": 280 + }, + { + "epoch": 0.04996554100620262, + "grad_norm": 1.9876508712768555, + "learning_rate": 1.6647531572904707e-08, + "logits/chosen": -2.9049034118652344, + "logits/rejected": -2.895927667617798, + "logps/chosen": -56.95619583129883, + "logps/rejected": -53.31736373901367, + "loss": 0.6931, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -0.00022282273857854307, + "rewards/margins": 7.085135439410806e-05, + "rewards/rejected": -0.00029367406386882067, + "step": 290 + }, + { + "epoch": 0.051688490696071676, + "grad_norm": 2.060868501663208, + "learning_rate": 1.722158438576349e-08, + "logits/chosen": -2.853228807449341, + "logits/rejected": -2.8549416065216064, + "logps/chosen": -54.9939079284668, + "logps/rejected": -53.275054931640625, + "loss": 0.6931, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -2.7072605007560924e-05, + "rewards/margins": 0.00010046066017821431, + "rewards/rejected": -0.0001275332469958812, + "step": 300 + }, + { + "epoch": 0.05341144038594073, + "grad_norm": 2.057678699493408, + "learning_rate": 1.7795637198622274e-08, + "logits/chosen": -2.9021923542022705, + "logits/rejected": -2.9028067588806152, + "logps/chosen": -54.8409423828125, + "logps/rejected": -52.515342712402344, + "loss": 0.6931, + "rewards/accuracies": 0.53125, + "rewards/chosen": -7.745550101390108e-05, + "rewards/margins": 5.208273069001734e-05, + "rewards/rejected": -0.0001295382244279608, + "step": 310 + }, + { + "epoch": 0.05513439007580979, + "grad_norm": 2.3376967906951904, + "learning_rate": 1.8369690011481057e-08, + "logits/chosen": -2.887517213821411, + "logits/rejected": -2.869492769241333, + "logps/chosen": -56.575233459472656, + "logps/rejected": -48.890525817871094, + "loss": 0.6932, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -0.00010880133777391165, + "rewards/margins": -3.789450420299545e-05, + "rewards/rejected": -7.09068845026195e-05, + "step": 320 + }, + { + "epoch": 0.05685733976567884, + "grad_norm": 2.0127511024475098, + "learning_rate": 1.894374282433984e-08, + "logits/chosen": -2.9153316020965576, + "logits/rejected": -2.898012161254883, + "logps/chosen": -56.18683624267578, + "logps/rejected": -51.056007385253906, + "loss": 0.6929, + "rewards/accuracies": 0.625, + "rewards/chosen": 8.49324424052611e-05, + "rewards/margins": 0.00045079676783643663, + "rewards/rejected": -0.0003658643108792603, + "step": 330 + }, + { + "epoch": 0.0585802894555479, + "grad_norm": 1.9972854852676392, + "learning_rate": 1.9517795637198624e-08, + "logits/chosen": -2.883319139480591, + "logits/rejected": -2.8704676628112793, + "logps/chosen": -52.8604621887207, + "logps/rejected": -51.86638259887695, + "loss": 0.6931, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.00010438141180202365, + "rewards/margins": 0.0001349625817965716, + "rewards/rejected": -0.00023934399359859526, + "step": 340 + }, + { + "epoch": 0.06030323914541695, + "grad_norm": 2.250826597213745, + "learning_rate": 2.0091848450057404e-08, + "logits/chosen": -2.8549716472625732, + "logits/rejected": -2.82232666015625, + "logps/chosen": -57.10233688354492, + "logps/rejected": -53.84283447265625, + "loss": 0.6931, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": 4.836320294998586e-05, + "rewards/margins": 0.00014005022239871323, + "rewards/rejected": -9.168702672468498e-05, + "step": 350 + }, + { + "epoch": 0.06202618883528601, + "grad_norm": 2.1809022426605225, + "learning_rate": 2.0665901262916187e-08, + "logits/chosen": -2.9523656368255615, + "logits/rejected": -2.9341259002685547, + "logps/chosen": -56.03881072998047, + "logps/rejected": -49.349849700927734, + "loss": 0.6932, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.00038100595702417195, + "rewards/margins": -6.664998181804549e-06, + "rewards/rejected": -0.0003743409179151058, + "step": 360 + }, + { + "epoch": 0.06374913852515507, + "grad_norm": 2.0530834197998047, + "learning_rate": 2.123995407577497e-08, + "logits/chosen": -2.938208818435669, + "logits/rejected": -2.913538694381714, + "logps/chosen": -54.34980392456055, + "logps/rejected": -50.99808120727539, + "loss": 0.693, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.00012077521387254819, + "rewards/margins": 0.00036179949529469013, + "rewards/rejected": -0.0004825748037546873, + "step": 370 + }, + { + "epoch": 0.06547208821502412, + "grad_norm": 1.9240037202835083, + "learning_rate": 2.1814006888633754e-08, + "logits/chosen": -2.9995782375335693, + "logits/rejected": -2.9801979064941406, + "logps/chosen": -55.090370178222656, + "logps/rejected": -51.10495376586914, + "loss": 0.693, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.0001042260555550456, + "rewards/margins": 0.0003570080443751067, + "rewards/rejected": -0.0004612340999301523, + "step": 380 + }, + { + "epoch": 0.06719503790489317, + "grad_norm": 2.231076240539551, + "learning_rate": 2.2388059701492537e-08, + "logits/chosen": -2.9322702884674072, + "logits/rejected": -2.9176416397094727, + "logps/chosen": -57.369293212890625, + "logps/rejected": -54.06401824951172, + "loss": 0.6932, + "rewards/accuracies": 0.4437499940395355, + "rewards/chosen": -0.0004567280411720276, + "rewards/margins": -0.00018372261547483504, + "rewards/rejected": -0.0002730053965933621, + "step": 390 + }, + { + "epoch": 0.06891798759476224, + "grad_norm": 1.7321640253067017, + "learning_rate": 2.296211251435132e-08, + "logits/chosen": -2.919123649597168, + "logits/rejected": -2.9075767993927, + "logps/chosen": -54.519500732421875, + "logps/rejected": -51.47249221801758, + "loss": 0.693, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.00017625563486944884, + "rewards/margins": 0.00038831119309179485, + "rewards/rejected": -0.0005645668716169894, + "step": 400 + }, + { + "epoch": 0.06891798759476224, + "eval_logits/chosen": -2.9726641178131104, + "eval_logits/rejected": -2.9690604209899902, + "eval_logps/chosen": -58.985782623291016, + "eval_logps/rejected": -62.72700500488281, + "eval_loss": 0.693112313747406, + "eval_rewards/accuracies": 0.5111523866653442, + "eval_rewards/chosen": 0.00029688214999623597, + "eval_rewards/margins": 7.089837890816852e-05, + "eval_rewards/rejected": 0.00022598376381210983, + "eval_runtime": 383.0475, + "eval_samples_per_second": 11.236, + "eval_steps_per_second": 1.405, + "step": 400 + }, + { + "epoch": 0.07064093728463129, + "grad_norm": 1.944787621498108, + "learning_rate": 2.3536165327210104e-08, + "logits/chosen": -2.9126317501068115, + "logits/rejected": -2.913689136505127, + "logps/chosen": -51.16768264770508, + "logps/rejected": -54.39495849609375, + "loss": 0.6931, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.0004532871244009584, + "rewards/margins": 0.00013652675261255354, + "rewards/rejected": -0.0005898139206692576, + "step": 410 + }, + { + "epoch": 0.07236388697450034, + "grad_norm": 2.370176076889038, + "learning_rate": 2.4110218140068887e-08, + "logits/chosen": -2.899533748626709, + "logits/rejected": -2.896271228790283, + "logps/chosen": -55.39063262939453, + "logps/rejected": -53.54706573486328, + "loss": 0.693, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.00029050689772702754, + "rewards/margins": 0.00030943285673856735, + "rewards/rejected": -0.0005999397253617644, + "step": 420 + }, + { + "epoch": 0.0740868366643694, + "grad_norm": 2.0748910903930664, + "learning_rate": 2.4684270952927668e-08, + "logits/chosen": -2.93806529045105, + "logits/rejected": -2.927563190460205, + "logps/chosen": -54.763023376464844, + "logps/rejected": -53.027503967285156, + "loss": 0.693, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -4.900352723780088e-05, + "rewards/margins": 0.0003839733253698796, + "rewards/rejected": -0.00043297684169374406, + "step": 430 + }, + { + "epoch": 0.07580978635423846, + "grad_norm": 2.3829500675201416, + "learning_rate": 2.5258323765786448e-08, + "logits/chosen": -2.979527235031128, + "logits/rejected": -2.953399896621704, + "logps/chosen": -55.02161407470703, + "logps/rejected": -52.73406982421875, + "loss": 0.6927, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -5.88020084251184e-05, + "rewards/margins": 0.0008097798563539982, + "rewards/rejected": -0.000868581875693053, + "step": 440 + }, + { + "epoch": 0.07753273604410751, + "grad_norm": 2.041431427001953, + "learning_rate": 2.583237657864523e-08, + "logits/chosen": -2.9178643226623535, + "logits/rejected": -2.8964293003082275, + "logps/chosen": -57.30718231201172, + "logps/rejected": -54.427452087402344, + "loss": 0.693, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.0002218265290139243, + "rewards/margins": 0.00039240572368726134, + "rewards/rejected": -0.0006142322672531009, + "step": 450 + }, + { + "epoch": 0.07925568573397657, + "grad_norm": 1.999345064163208, + "learning_rate": 2.6406429391504014e-08, + "logits/chosen": -2.8885178565979004, + "logits/rejected": -2.877211570739746, + "logps/chosen": -57.013694763183594, + "logps/rejected": -52.124473571777344, + "loss": 0.693, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.00032575431396253407, + "rewards/margins": 0.00021151344117242843, + "rewards/rejected": -0.0005372677696868777, + "step": 460 + }, + { + "epoch": 0.08097863542384562, + "grad_norm": 2.046933174133301, + "learning_rate": 2.6980482204362798e-08, + "logits/chosen": -2.9009499549865723, + "logits/rejected": -2.8764655590057373, + "logps/chosen": -54.453948974609375, + "logps/rejected": -50.358951568603516, + "loss": 0.6928, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -4.428888496477157e-05, + "rewards/margins": 0.0006898181745782495, + "rewards/rejected": -0.0007341071031987667, + "step": 470 + }, + { + "epoch": 0.08270158511371468, + "grad_norm": 2.3568708896636963, + "learning_rate": 2.755453501722158e-08, + "logits/chosen": -2.9118103981018066, + "logits/rejected": -2.904237747192383, + "logps/chosen": -54.8182373046875, + "logps/rejected": -58.23457717895508, + "loss": 0.6928, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.00026756085571832955, + "rewards/margins": 0.000665828469209373, + "rewards/rejected": -0.000933389354031533, + "step": 480 + }, + { + "epoch": 0.08442453480358374, + "grad_norm": 2.236090898513794, + "learning_rate": 2.8128587830080364e-08, + "logits/chosen": -2.8598079681396484, + "logits/rejected": -2.8242173194885254, + "logps/chosen": -61.14402389526367, + "logps/rejected": -50.303977966308594, + "loss": 0.6927, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.0002923067077063024, + "rewards/margins": 0.0008667553775012493, + "rewards/rejected": -0.0011590620269998908, + "step": 490 + }, + { + "epoch": 0.08614748449345279, + "grad_norm": 1.9498893022537231, + "learning_rate": 2.8702640642939148e-08, + "logits/chosen": -2.8858821392059326, + "logits/rejected": -2.868650436401367, + "logps/chosen": -56.36248016357422, + "logps/rejected": -51.5005989074707, + "loss": 0.693, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.0007818065350875258, + "rewards/margins": 0.00028844509506598115, + "rewards/rejected": -0.001070251688361168, + "step": 500 + }, + { + "epoch": 0.08787043418332184, + "grad_norm": 1.9891581535339355, + "learning_rate": 2.927669345579793e-08, + "logits/chosen": -2.8594257831573486, + "logits/rejected": -2.84855318069458, + "logps/chosen": -58.753883361816406, + "logps/rejected": -51.88996124267578, + "loss": 0.693, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.0007896844181232154, + "rewards/margins": 0.00022462100605480373, + "rewards/rejected": -0.0010143055114895105, + "step": 510 + }, + { + "epoch": 0.08959338387319091, + "grad_norm": 1.96133291721344, + "learning_rate": 2.9850746268656714e-08, + "logits/chosen": -2.9103283882141113, + "logits/rejected": -2.893756151199341, + "logps/chosen": -56.875282287597656, + "logps/rejected": -51.322296142578125, + "loss": 0.6928, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -0.0010125295957550406, + "rewards/margins": 0.0006925197667442262, + "rewards/rejected": -0.001705049304291606, + "step": 520 + }, + { + "epoch": 0.09131633356305996, + "grad_norm": 1.8310251235961914, + "learning_rate": 3.0424799081515494e-08, + "logits/chosen": -2.908512592315674, + "logits/rejected": -2.877866744995117, + "logps/chosen": -56.7138671875, + "logps/rejected": -50.077171325683594, + "loss": 0.6926, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.0005267611122690141, + "rewards/margins": 0.0011749586556106806, + "rewards/rejected": -0.0017017197096720338, + "step": 530 + }, + { + "epoch": 0.09303928325292901, + "grad_norm": 1.9770342111587524, + "learning_rate": 3.099885189437428e-08, + "logits/chosen": -2.8999149799346924, + "logits/rejected": -2.88718318939209, + "logps/chosen": -54.016265869140625, + "logps/rejected": -52.41388702392578, + "loss": 0.6927, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.0007578878430649638, + "rewards/margins": 0.0009833236690610647, + "rewards/rejected": -0.0017412115121260285, + "step": 540 + }, + { + "epoch": 0.09476223294279806, + "grad_norm": 2.0536069869995117, + "learning_rate": 3.157290470723307e-08, + "logits/chosen": -2.9287686347961426, + "logits/rejected": -2.9126522541046143, + "logps/chosen": -54.85844802856445, + "logps/rejected": -51.18532180786133, + "loss": 0.6926, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.0011350096901878715, + "rewards/margins": 0.0010482745710760355, + "rewards/rejected": -0.0021832843776792288, + "step": 550 + }, + { + "epoch": 0.09648518263266713, + "grad_norm": 2.260427236557007, + "learning_rate": 3.214695752009185e-08, + "logits/chosen": -2.914752721786499, + "logits/rejected": -2.904942035675049, + "logps/chosen": -53.504119873046875, + "logps/rejected": -54.135826110839844, + "loss": 0.6927, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.0010297555709257722, + "rewards/margins": 0.000813187682069838, + "rewards/rejected": -0.001842943369410932, + "step": 560 + }, + { + "epoch": 0.09820813232253618, + "grad_norm": 2.097816228866577, + "learning_rate": 3.2721010332950634e-08, + "logits/chosen": -2.8879055976867676, + "logits/rejected": -2.883946180343628, + "logps/chosen": -52.45808792114258, + "logps/rejected": -53.73316192626953, + "loss": 0.6926, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.000989855034276843, + "rewards/margins": 0.0010882084025070071, + "rewards/rejected": -0.0020780630875378847, + "step": 570 + }, + { + "epoch": 0.09993108201240523, + "grad_norm": 1.696934461593628, + "learning_rate": 3.3295063145809414e-08, + "logits/chosen": -2.9020581245422363, + "logits/rejected": -2.8965868949890137, + "logps/chosen": -52.064788818359375, + "logps/rejected": -51.971031188964844, + "loss": 0.6928, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -0.0011961839627474546, + "rewards/margins": 0.0007127647986635566, + "rewards/rejected": -0.0019089489942416549, + "step": 580 + }, + { + "epoch": 0.1016540317022743, + "grad_norm": 2.097994565963745, + "learning_rate": 3.38691159586682e-08, + "logits/chosen": -2.899217367172241, + "logits/rejected": -2.885105848312378, + "logps/chosen": -55.10315704345703, + "logps/rejected": -54.668983459472656, + "loss": 0.6928, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -0.0012872371589764953, + "rewards/margins": 0.0007017455063760281, + "rewards/rejected": -0.0019889825489372015, + "step": 590 + }, + { + "epoch": 0.10337698139214335, + "grad_norm": 2.353062868118286, + "learning_rate": 3.444316877152698e-08, + "logits/chosen": -2.901080846786499, + "logits/rejected": -2.8811850547790527, + "logps/chosen": -55.64324951171875, + "logps/rejected": -55.54071044921875, + "loss": 0.692, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.00037466268986463547, + "rewards/margins": 0.0022983322851359844, + "rewards/rejected": -0.00267299497500062, + "step": 600 + }, + { + "epoch": 0.1050999310820124, + "grad_norm": 2.102449893951416, + "learning_rate": 3.501722158438576e-08, + "logits/chosen": -2.8592283725738525, + "logits/rejected": -2.8586907386779785, + "logps/chosen": -54.448280334472656, + "logps/rejected": -53.2332649230957, + "loss": 0.6928, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.0012435053940862417, + "rewards/margins": 0.0006144942017272115, + "rewards/rejected": -0.0018579994793981314, + "step": 610 + }, + { + "epoch": 0.10682288077188146, + "grad_norm": 2.2198610305786133, + "learning_rate": 3.559127439724455e-08, + "logits/chosen": -2.9471917152404785, + "logits/rejected": -2.9253733158111572, + "logps/chosen": -56.49320602416992, + "logps/rejected": -52.7685432434082, + "loss": 0.6924, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.0014289176324382424, + "rewards/margins": 0.0015511448727920651, + "rewards/rejected": -0.002980062272399664, + "step": 620 + }, + { + "epoch": 0.10854583046175052, + "grad_norm": 2.3251566886901855, + "learning_rate": 3.616532721010333e-08, + "logits/chosen": -2.9427034854888916, + "logits/rejected": -2.9193663597106934, + "logps/chosen": -55.36882400512695, + "logps/rejected": -50.096778869628906, + "loss": 0.6925, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.0016340886941179633, + "rewards/margins": 0.0012800354743376374, + "rewards/rejected": -0.0029141241684556007, + "step": 630 + }, + { + "epoch": 0.11026878015161957, + "grad_norm": 2.282984972000122, + "learning_rate": 3.6739380022962115e-08, + "logits/chosen": -2.9252994060516357, + "logits/rejected": -2.922450065612793, + "logps/chosen": -53.620811462402344, + "logps/rejected": -53.71160888671875, + "loss": 0.6927, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.0018658351618796587, + "rewards/margins": 0.0008461518445983529, + "rewards/rejected": -0.002711986657232046, + "step": 640 + }, + { + "epoch": 0.11199172984148863, + "grad_norm": 2.365267515182495, + "learning_rate": 3.7313432835820895e-08, + "logits/chosen": -2.9289305210113525, + "logits/rejected": -2.9324915409088135, + "logps/chosen": -53.2087287902832, + "logps/rejected": -54.3664665222168, + "loss": 0.693, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.002466582925990224, + "rewards/margins": 0.00023730102111585438, + "rewards/rejected": -0.0027038834523409605, + "step": 650 + }, + { + "epoch": 0.11371467953135768, + "grad_norm": 2.1056787967681885, + "learning_rate": 3.788748564867968e-08, + "logits/chosen": -2.87402081489563, + "logits/rejected": -2.874648094177246, + "logps/chosen": -55.8626708984375, + "logps/rejected": -52.24543380737305, + "loss": 0.6926, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.0017661073943600059, + "rewards/margins": 0.0010808532824739814, + "rewards/rejected": -0.0028469606768339872, + "step": 660 + }, + { + "epoch": 0.11543762922122675, + "grad_norm": 2.0044331550598145, + "learning_rate": 3.846153846153846e-08, + "logits/chosen": -2.8894381523132324, + "logits/rejected": -2.8855738639831543, + "logps/chosen": -54.400245666503906, + "logps/rejected": -57.206260681152344, + "loss": 0.6927, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.0014983124565333128, + "rewards/margins": 0.0009925318881869316, + "rewards/rejected": -0.0024908443447202444, + "step": 670 + }, + { + "epoch": 0.1171605789110958, + "grad_norm": 2.1866514682769775, + "learning_rate": 3.903559127439725e-08, + "logits/chosen": -2.8572490215301514, + "logits/rejected": -2.834527015686035, + "logps/chosen": -55.36687088012695, + "logps/rejected": -50.750118255615234, + "loss": 0.6925, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.0026890942826867104, + "rewards/margins": 0.0012137128505855799, + "rewards/rejected": -0.0039028071332722902, + "step": 680 + }, + { + "epoch": 0.11888352860096485, + "grad_norm": 2.242223024368286, + "learning_rate": 3.960964408725603e-08, + "logits/chosen": -2.947134494781494, + "logits/rejected": -2.923008441925049, + "logps/chosen": -60.630027770996094, + "logps/rejected": -50.56222915649414, + "loss": 0.6923, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.001378271379508078, + "rewards/margins": 0.0016906490782275796, + "rewards/rejected": -0.0030689204577356577, + "step": 690 + }, + { + "epoch": 0.1206064782908339, + "grad_norm": 2.1509509086608887, + "learning_rate": 4.018369690011481e-08, + "logits/chosen": -2.9139673709869385, + "logits/rejected": -2.890505313873291, + "logps/chosen": -56.62127685546875, + "logps/rejected": -52.38640213012695, + "loss": 0.6921, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.0015380937838926911, + "rewards/margins": 0.0020837439224123955, + "rewards/rejected": -0.003621837589889765, + "step": 700 + }, + { + "epoch": 0.12232942798070297, + "grad_norm": 2.0617010593414307, + "learning_rate": 4.0757749712973595e-08, + "logits/chosen": -2.9065475463867188, + "logits/rejected": -2.8897900581359863, + "logps/chosen": -55.495445251464844, + "logps/rejected": -54.319091796875, + "loss": 0.6923, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.0023886431008577347, + "rewards/margins": 0.0016835425049066544, + "rewards/rejected": -0.004072185140103102, + "step": 710 + }, + { + "epoch": 0.12405237767057202, + "grad_norm": 2.1775104999542236, + "learning_rate": 4.1331802525832375e-08, + "logits/chosen": -2.8805248737335205, + "logits/rejected": -2.8769776821136475, + "logps/chosen": -54.568397521972656, + "logps/rejected": -53.88288116455078, + "loss": 0.6927, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.002390020526945591, + "rewards/margins": 0.0008614700054749846, + "rewards/rejected": -0.0032514906488358974, + "step": 720 + }, + { + "epoch": 0.12577532736044109, + "grad_norm": 2.2033438682556152, + "learning_rate": 4.190585533869116e-08, + "logits/chosen": -2.9489083290100098, + "logits/rejected": -2.9298577308654785, + "logps/chosen": -57.70631790161133, + "logps/rejected": -52.4577751159668, + "loss": 0.6919, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.002133384346961975, + "rewards/margins": 0.002538988832384348, + "rewards/rejected": -0.004672372248023748, + "step": 730 + }, + { + "epoch": 0.12749827705031014, + "grad_norm": 2.2610881328582764, + "learning_rate": 4.247990815154994e-08, + "logits/chosen": -2.8875927925109863, + "logits/rejected": -2.867264747619629, + "logps/chosen": -55.90110397338867, + "logps/rejected": -53.75722122192383, + "loss": 0.6921, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.002014304045587778, + "rewards/margins": 0.002179005416110158, + "rewards/rejected": -0.00419330969452858, + "step": 740 + }, + { + "epoch": 0.1292212267401792, + "grad_norm": 2.137699842453003, + "learning_rate": 4.305396096440873e-08, + "logits/chosen": -2.997771978378296, + "logits/rejected": -2.9786887168884277, + "logps/chosen": -57.344635009765625, + "logps/rejected": -53.46533966064453, + "loss": 0.6913, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.0014387632254511118, + "rewards/margins": 0.0038202754221856594, + "rewards/rejected": -0.0052590384148061275, + "step": 750 + }, + { + "epoch": 0.13094417643004824, + "grad_norm": 2.469003200531006, + "learning_rate": 4.362801377726751e-08, + "logits/chosen": -2.907745838165283, + "logits/rejected": -2.879517078399658, + "logps/chosen": -55.790306091308594, + "logps/rejected": -49.0444450378418, + "loss": 0.6912, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.00264460826292634, + "rewards/margins": 0.003907923586666584, + "rewards/rejected": -0.006552531383931637, + "step": 760 + }, + { + "epoch": 0.1326671261199173, + "grad_norm": 2.072821617126465, + "learning_rate": 4.420206659012629e-08, + "logits/chosen": -2.930954694747925, + "logits/rejected": -2.917003631591797, + "logps/chosen": -54.62306594848633, + "logps/rejected": -52.37617874145508, + "loss": 0.6914, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.0027048583142459393, + "rewards/margins": 0.0036215726286172867, + "rewards/rejected": -0.0063264318741858006, + "step": 770 + }, + { + "epoch": 0.13439007580978635, + "grad_norm": 2.3292832374572754, + "learning_rate": 4.4776119402985075e-08, + "logits/chosen": -2.929471254348755, + "logits/rejected": -2.9057729244232178, + "logps/chosen": -55.01372528076172, + "logps/rejected": -51.13895797729492, + "loss": 0.6912, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.0043188584968447685, + "rewards/margins": 0.0038614545483142138, + "rewards/rejected": -0.008180314674973488, + "step": 780 + }, + { + "epoch": 0.1361130254996554, + "grad_norm": 2.2163500785827637, + "learning_rate": 4.5350172215843855e-08, + "logits/chosen": -2.927473545074463, + "logits/rejected": -2.9015581607818604, + "logps/chosen": -55.29557418823242, + "logps/rejected": -53.673065185546875, + "loss": 0.6909, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.0033913864754140377, + "rewards/margins": 0.0045530120842158794, + "rewards/rejected": -0.007944399490952492, + "step": 790 + }, + { + "epoch": 0.13783597518952448, + "grad_norm": 2.277212381362915, + "learning_rate": 4.592422502870264e-08, + "logits/chosen": -2.8656978607177734, + "logits/rejected": -2.8462979793548584, + "logps/chosen": -56.265106201171875, + "logps/rejected": -55.311790466308594, + "loss": 0.6923, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.005564314313232899, + "rewards/margins": 0.001834776485338807, + "rewards/rejected": -0.0073990910314023495, + "step": 800 + }, + { + "epoch": 0.13783597518952448, + "eval_logits/chosen": -2.970120668411255, + "eval_logits/rejected": -2.966658115386963, + "eval_logps/chosen": -58.77965545654297, + "eval_logps/rejected": -62.625755310058594, + "eval_loss": 0.6925970315933228, + "eval_rewards/accuracies": 0.5492565035820007, + "eval_rewards/chosen": 0.0023581732530146837, + "eval_rewards/margins": 0.0011197492713108659, + "eval_rewards/rejected": 0.0012384242145344615, + "eval_runtime": 382.8666, + "eval_samples_per_second": 11.242, + "eval_steps_per_second": 1.405, + "step": 800 + }, + { + "epoch": 0.13955892487939353, + "grad_norm": 2.4686055183410645, + "learning_rate": 4.649827784156142e-08, + "logits/chosen": -2.8969507217407227, + "logits/rejected": -2.8752167224884033, + "logps/chosen": -57.342254638671875, + "logps/rejected": -55.21208572387695, + "loss": 0.6917, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.004046624060720205, + "rewards/margins": 0.002936845412477851, + "rewards/rejected": -0.006983468774706125, + "step": 810 + }, + { + "epoch": 0.14128187456926258, + "grad_norm": 2.2558839321136475, + "learning_rate": 4.707233065442021e-08, + "logits/chosen": -2.9297451972961426, + "logits/rejected": -2.9120051860809326, + "logps/chosen": -53.40732955932617, + "logps/rejected": -51.31257247924805, + "loss": 0.6918, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.005798877216875553, + "rewards/margins": 0.002780771581456065, + "rewards/rejected": -0.008579649031162262, + "step": 820 + }, + { + "epoch": 0.14300482425913164, + "grad_norm": 2.1874077320098877, + "learning_rate": 4.764638346727899e-08, + "logits/chosen": -2.918710231781006, + "logits/rejected": -2.902024984359741, + "logps/chosen": -56.356163024902344, + "logps/rejected": -54.185760498046875, + "loss": 0.6903, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.004841519054025412, + "rewards/margins": 0.005869493819773197, + "rewards/rejected": -0.010711013339459896, + "step": 830 + }, + { + "epoch": 0.1447277739490007, + "grad_norm": 2.145451784133911, + "learning_rate": 4.8220436280137775e-08, + "logits/chosen": -2.951972484588623, + "logits/rejected": -2.930785894393921, + "logps/chosen": -55.628273010253906, + "logps/rejected": -50.5267333984375, + "loss": 0.6905, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.005067192949354649, + "rewards/margins": 0.005340776406228542, + "rewards/rejected": -0.010407969355583191, + "step": 840 + }, + { + "epoch": 0.14645072363886974, + "grad_norm": 2.073517322540283, + "learning_rate": 4.8794489092996555e-08, + "logits/chosen": -2.890193223953247, + "logits/rejected": -2.8816397190093994, + "logps/chosen": -52.18341827392578, + "logps/rejected": -55.33275604248047, + "loss": 0.6918, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.006669786758720875, + "rewards/margins": 0.002874907338991761, + "rewards/rejected": -0.009544694796204567, + "step": 850 + }, + { + "epoch": 0.1481736733287388, + "grad_norm": 2.4177420139312744, + "learning_rate": 4.9368541905855335e-08, + "logits/chosen": -2.915323495864868, + "logits/rejected": -2.901231050491333, + "logps/chosen": -54.65660858154297, + "logps/rejected": -52.22821807861328, + "loss": 0.6905, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.007540437392890453, + "rewards/margins": 0.005416450090706348, + "rewards/rejected": -0.012956887483596802, + "step": 860 + }, + { + "epoch": 0.14989662301860784, + "grad_norm": 1.7828757762908936, + "learning_rate": 4.994259471871412e-08, + "logits/chosen": -2.940559148788452, + "logits/rejected": -2.9429688453674316, + "logps/chosen": -52.750709533691406, + "logps/rejected": -54.044281005859375, + "loss": 0.6911, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.008697355166077614, + "rewards/margins": 0.0041740224696695805, + "rewards/rejected": -0.012871377170085907, + "step": 870 + }, + { + "epoch": 0.15161957270847692, + "grad_norm": 1.9108960628509521, + "learning_rate": 5.0516647531572895e-08, + "logits/chosen": -2.8837485313415527, + "logits/rejected": -2.863524913787842, + "logps/chosen": -54.0827751159668, + "logps/rejected": -51.63903045654297, + "loss": 0.6893, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.006638042628765106, + "rewards/margins": 0.007870601490139961, + "rewards/rejected": -0.014508644118905067, + "step": 880 + }, + { + "epoch": 0.15334252239834598, + "grad_norm": 2.07565975189209, + "learning_rate": 5.109070034443168e-08, + "logits/chosen": -2.908128499984741, + "logits/rejected": -2.8816587924957275, + "logps/chosen": -60.31031036376953, + "logps/rejected": -54.7435302734375, + "loss": 0.6894, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.0049650766886770725, + "rewards/margins": 0.007679118309170008, + "rewards/rejected": -0.012644194066524506, + "step": 890 + }, + { + "epoch": 0.15506547208821503, + "grad_norm": 2.0747740268707275, + "learning_rate": 5.166475315729046e-08, + "logits/chosen": -2.927508592605591, + "logits/rejected": -2.921992540359497, + "logps/chosen": -55.90435028076172, + "logps/rejected": -52.45256805419922, + "loss": 0.691, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.008611300960183144, + "rewards/margins": 0.004552994389086962, + "rewards/rejected": -0.013164294883608818, + "step": 900 + }, + { + "epoch": 0.15678842177808408, + "grad_norm": 2.0262808799743652, + "learning_rate": 5.223880597014925e-08, + "logits/chosen": -2.904489040374756, + "logits/rejected": -2.902407646179199, + "logps/chosen": -53.1060676574707, + "logps/rejected": -53.28779220581055, + "loss": 0.6914, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.010441782884299755, + "rewards/margins": 0.0036925789900124073, + "rewards/rejected": -0.01413436233997345, + "step": 910 + }, + { + "epoch": 0.15851137146795313, + "grad_norm": 2.245786428451538, + "learning_rate": 5.281285878300803e-08, + "logits/chosen": -2.928288459777832, + "logits/rejected": -2.896723985671997, + "logps/chosen": -56.58086013793945, + "logps/rejected": -50.05290603637695, + "loss": 0.6894, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.010822747834026814, + "rewards/margins": 0.007769578602164984, + "rewards/rejected": -0.01859232783317566, + "step": 920 + }, + { + "epoch": 0.16023432115782218, + "grad_norm": 2.340181827545166, + "learning_rate": 5.3386911595866815e-08, + "logits/chosen": -2.9540724754333496, + "logits/rejected": -2.9424567222595215, + "logps/chosen": -54.608428955078125, + "logps/rejected": -54.735633850097656, + "loss": 0.6899, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.01000906340777874, + "rewards/margins": 0.006677950266748667, + "rewards/rejected": -0.01668701320886612, + "step": 930 + }, + { + "epoch": 0.16195727084769124, + "grad_norm": 2.0220203399658203, + "learning_rate": 5.3960964408725595e-08, + "logits/chosen": -2.9646830558776855, + "logits/rejected": -2.937711238861084, + "logps/chosen": -62.19846725463867, + "logps/rejected": -56.09409713745117, + "loss": 0.6898, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.008925000205636024, + "rewards/margins": 0.0069125862792134285, + "rewards/rejected": -0.01583758369088173, + "step": 940 + }, + { + "epoch": 0.16368022053756032, + "grad_norm": 2.1144919395446777, + "learning_rate": 5.4535017221584375e-08, + "logits/chosen": -2.8292360305786133, + "logits/rejected": -2.8140792846679688, + "logps/chosen": -57.392913818359375, + "logps/rejected": -56.34737014770508, + "loss": 0.6908, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.014584523625671864, + "rewards/margins": 0.005001784302294254, + "rewards/rejected": -0.019586309790611267, + "step": 950 + }, + { + "epoch": 0.16540317022742937, + "grad_norm": 2.0776162147521973, + "learning_rate": 5.510907003444316e-08, + "logits/chosen": -2.8197903633117676, + "logits/rejected": -2.8229846954345703, + "logps/chosen": -53.091064453125, + "logps/rejected": -56.24619674682617, + "loss": 0.6928, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -0.015429055318236351, + "rewards/margins": 0.0008552552899345756, + "rewards/rejected": -0.016284313052892685, + "step": 960 + }, + { + "epoch": 0.16712611991729842, + "grad_norm": 2.197368621826172, + "learning_rate": 5.568312284730194e-08, + "logits/chosen": -2.9082279205322266, + "logits/rejected": -2.881479501724243, + "logps/chosen": -62.6414794921875, + "logps/rejected": -52.523406982421875, + "loss": 0.6912, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.013698337599635124, + "rewards/margins": 0.0042149461805820465, + "rewards/rejected": -0.01791328564286232, + "step": 970 + }, + { + "epoch": 0.16884906960716747, + "grad_norm": 2.396287441253662, + "learning_rate": 5.625717566016073e-08, + "logits/chosen": -2.9761950969696045, + "logits/rejected": -2.95814847946167, + "logps/chosen": -58.28235626220703, + "logps/rejected": -55.292930603027344, + "loss": 0.689, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.008706005290150642, + "rewards/margins": 0.0086164940148592, + "rewards/rejected": -0.01732250303030014, + "step": 980 + }, + { + "epoch": 0.17057201929703653, + "grad_norm": 2.2035701274871826, + "learning_rate": 5.683122847301951e-08, + "logits/chosen": -2.897202730178833, + "logits/rejected": -2.878493309020996, + "logps/chosen": -56.3300666809082, + "logps/rejected": -53.285552978515625, + "loss": 0.691, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.015088607557117939, + "rewards/margins": 0.004525421187281609, + "rewards/rejected": -0.019614029675722122, + "step": 990 + }, + { + "epoch": 0.17229496898690558, + "grad_norm": 1.9301676750183105, + "learning_rate": 5.7405281285878295e-08, + "logits/chosen": -2.8635621070861816, + "logits/rejected": -2.841841220855713, + "logps/chosen": -59.418907165527344, + "logps/rejected": -52.82847213745117, + "loss": 0.6902, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.012757417745888233, + "rewards/margins": 0.006062129978090525, + "rewards/rejected": -0.018819546326994896, + "step": 1000 + }, + { + "epoch": 0.17401791867677463, + "grad_norm": 2.214897632598877, + "learning_rate": 5.7979334098737075e-08, + "logits/chosen": -2.823845863342285, + "logits/rejected": -2.8300139904022217, + "logps/chosen": -56.06037139892578, + "logps/rejected": -58.37239456176758, + "loss": 0.6918, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.017535416409373283, + "rewards/margins": 0.0029864010866731405, + "rewards/rejected": -0.020521817728877068, + "step": 1010 + }, + { + "epoch": 0.17574086836664368, + "grad_norm": 2.2331719398498535, + "learning_rate": 5.855338691159586e-08, + "logits/chosen": -2.9262149333953857, + "logits/rejected": -2.902949333190918, + "logps/chosen": -59.038734436035156, + "logps/rejected": -55.38860321044922, + "loss": 0.6868, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.008785584941506386, + "rewards/margins": 0.013060608878731728, + "rewards/rejected": -0.021846191957592964, + "step": 1020 + }, + { + "epoch": 0.17746381805651276, + "grad_norm": 2.039205312728882, + "learning_rate": 5.912743972445464e-08, + "logits/chosen": -2.9541501998901367, + "logits/rejected": -2.934560775756836, + "logps/chosen": -56.76179122924805, + "logps/rejected": -53.00408935546875, + "loss": 0.6899, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.013884862884879112, + "rewards/margins": 0.006750473286956549, + "rewards/rejected": -0.02063533291220665, + "step": 1030 + }, + { + "epoch": 0.17918676774638181, + "grad_norm": 2.338857889175415, + "learning_rate": 5.970149253731343e-08, + "logits/chosen": -2.9137110710144043, + "logits/rejected": -2.9044034481048584, + "logps/chosen": -55.28099822998047, + "logps/rejected": -57.13520050048828, + "loss": 0.6923, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -0.019616015255451202, + "rewards/margins": 0.00203714263625443, + "rewards/rejected": -0.021653154864907265, + "step": 1040 + }, + { + "epoch": 0.18090971743625087, + "grad_norm": 2.2119388580322266, + "learning_rate": 6.02755453501722e-08, + "logits/chosen": -2.856088161468506, + "logits/rejected": -2.824749708175659, + "logps/chosen": -59.729095458984375, + "logps/rejected": -52.37529754638672, + "loss": 0.6887, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.01591629534959793, + "rewards/margins": 0.009324030950665474, + "rewards/rejected": -0.025240326300263405, + "step": 1050 + }, + { + "epoch": 0.18263266712611992, + "grad_norm": 2.2295567989349365, + "learning_rate": 6.084959816303099e-08, + "logits/chosen": -2.873307704925537, + "logits/rejected": -2.855834484100342, + "logps/chosen": -59.4355354309082, + "logps/rejected": -56.32249069213867, + "loss": 0.6916, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.017835207283496857, + "rewards/margins": 0.0034090355038642883, + "rewards/rejected": -0.021244242787361145, + "step": 1060 + }, + { + "epoch": 0.18435561681598897, + "grad_norm": 2.0987818241119385, + "learning_rate": 6.142365097588978e-08, + "logits/chosen": -2.9532623291015625, + "logits/rejected": -2.924928903579712, + "logps/chosen": -59.63908767700195, + "logps/rejected": -54.68932342529297, + "loss": 0.6887, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.01841794326901436, + "rewards/margins": 0.009393568150699139, + "rewards/rejected": -0.027811508625745773, + "step": 1070 + }, + { + "epoch": 0.18607856650585802, + "grad_norm": 2.119971513748169, + "learning_rate": 6.199770378874856e-08, + "logits/chosen": -2.9519741535186768, + "logits/rejected": -2.933764934539795, + "logps/chosen": -59.26136016845703, + "logps/rejected": -53.21483612060547, + "loss": 0.6895, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.021389838308095932, + "rewards/margins": 0.007719496730715036, + "rewards/rejected": -0.02910933457314968, + "step": 1080 + }, + { + "epoch": 0.18780151619572708, + "grad_norm": 2.250812292098999, + "learning_rate": 6.257175660160735e-08, + "logits/chosen": -2.864903211593628, + "logits/rejected": -2.863048553466797, + "logps/chosen": -55.70839309692383, + "logps/rejected": -54.59096145629883, + "loss": 0.6915, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.023310277611017227, + "rewards/margins": 0.003674765583127737, + "rewards/rejected": -0.026985038071870804, + "step": 1090 + }, + { + "epoch": 0.18952446588559613, + "grad_norm": 1.9611529111862183, + "learning_rate": 6.314580941446614e-08, + "logits/chosen": -2.88694429397583, + "logits/rejected": -2.8873910903930664, + "logps/chosen": -54.11775588989258, + "logps/rejected": -57.08624267578125, + "loss": 0.6886, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.018750367686152458, + "rewards/margins": 0.009554053656756878, + "rewards/rejected": -0.02830442227423191, + "step": 1100 + }, + { + "epoch": 0.1912474155754652, + "grad_norm": 2.4082181453704834, + "learning_rate": 6.371986222732492e-08, + "logits/chosen": -2.9054970741271973, + "logits/rejected": -2.9155497550964355, + "logps/chosen": -56.35832595825195, + "logps/rejected": -57.75475311279297, + "loss": 0.6914, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.025233492255210876, + "rewards/margins": 0.004043240565806627, + "rewards/rejected": -0.029276732355356216, + "step": 1110 + }, + { + "epoch": 0.19297036526533426, + "grad_norm": 2.127530574798584, + "learning_rate": 6.42939150401837e-08, + "logits/chosen": -2.9269912242889404, + "logits/rejected": -2.910547971725464, + "logps/chosen": -59.35639190673828, + "logps/rejected": -55.61716842651367, + "loss": 0.6893, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.023090193048119545, + "rewards/margins": 0.008246874436736107, + "rewards/rejected": -0.03133706375956535, + "step": 1120 + }, + { + "epoch": 0.1946933149552033, + "grad_norm": 2.0124289989471436, + "learning_rate": 6.486796785304248e-08, + "logits/chosen": -2.945962429046631, + "logits/rejected": -2.9308059215545654, + "logps/chosen": -54.93196487426758, + "logps/rejected": -56.35882568359375, + "loss": 0.6874, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.024388344958424568, + "rewards/margins": 0.012135234661400318, + "rewards/rejected": -0.03652357682585716, + "step": 1130 + }, + { + "epoch": 0.19641626464507236, + "grad_norm": 2.5086684226989746, + "learning_rate": 6.544202066590127e-08, + "logits/chosen": -2.9003541469573975, + "logits/rejected": -2.878969669342041, + "logps/chosen": -59.991233825683594, + "logps/rejected": -55.50764083862305, + "loss": 0.688, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.02201252616941929, + "rewards/margins": 0.010831797495484352, + "rewards/rejected": -0.03284432366490364, + "step": 1140 + }, + { + "epoch": 0.19813921433494142, + "grad_norm": 2.2725441455841064, + "learning_rate": 6.601607347876004e-08, + "logits/chosen": -2.8616178035736084, + "logits/rejected": -2.8515572547912598, + "logps/chosen": -57.003143310546875, + "logps/rejected": -56.3914680480957, + "loss": 0.6879, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.023792926222085953, + "rewards/margins": 0.01105764415115118, + "rewards/rejected": -0.034850575029850006, + "step": 1150 + }, + { + "epoch": 0.19986216402481047, + "grad_norm": 1.9798253774642944, + "learning_rate": 6.659012629161883e-08, + "logits/chosen": -2.927191734313965, + "logits/rejected": -2.920241117477417, + "logps/chosen": -57.67218780517578, + "logps/rejected": -56.62792205810547, + "loss": 0.6874, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.019430024549365044, + "rewards/margins": 0.011981760151684284, + "rewards/rejected": -0.0314117856323719, + "step": 1160 + }, + { + "epoch": 0.20158511371467952, + "grad_norm": 1.8830546140670776, + "learning_rate": 6.716417910447762e-08, + "logits/chosen": -2.8572309017181396, + "logits/rejected": -2.848026752471924, + "logps/chosen": -55.777008056640625, + "logps/rejected": -56.994873046875, + "loss": 0.6899, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -0.028517460450530052, + "rewards/margins": 0.0071545811370015144, + "rewards/rejected": -0.03567204624414444, + "step": 1170 + }, + { + "epoch": 0.2033080634045486, + "grad_norm": 2.2484617233276367, + "learning_rate": 6.77382319173364e-08, + "logits/chosen": -2.835697889328003, + "logits/rejected": -2.8139901161193848, + "logps/chosen": -56.77092361450195, + "logps/rejected": -54.284942626953125, + "loss": 0.6851, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.03087441995739937, + "rewards/margins": 0.01692943647503853, + "rewards/rejected": -0.0478038564324379, + "step": 1180 + }, + { + "epoch": 0.20503101309441765, + "grad_norm": 2.282123327255249, + "learning_rate": 6.831228473019518e-08, + "logits/chosen": -2.944603443145752, + "logits/rejected": -2.9229369163513184, + "logps/chosen": -61.991363525390625, + "logps/rejected": -53.7884521484375, + "loss": 0.6854, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.029348094016313553, + "rewards/margins": 0.01629854366183281, + "rewards/rejected": -0.04564663767814636, + "step": 1190 + }, + { + "epoch": 0.2067539627842867, + "grad_norm": 1.9270927906036377, + "learning_rate": 6.888633754305396e-08, + "logits/chosen": -2.847102403640747, + "logits/rejected": -2.8369410037994385, + "logps/chosen": -57.47083282470703, + "logps/rejected": -54.379608154296875, + "loss": 0.6901, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -0.03375508263707161, + "rewards/margins": 0.006854252424091101, + "rewards/rejected": -0.04060933366417885, + "step": 1200 + }, + { + "epoch": 0.2067539627842867, + "eval_logits/chosen": -2.961275577545166, + "eval_logits/rejected": -2.9578850269317627, + "eval_logps/chosen": -59.81459426879883, + "eval_logps/rejected": -64.08273315429688, + "eval_loss": 0.6906687617301941, + "eval_rewards/accuracies": 0.5697026252746582, + "eval_rewards/chosen": -0.007991200312972069, + "eval_rewards/margins": 0.005340192466974258, + "eval_rewards/rejected": -0.013331393711268902, + "eval_runtime": 382.9771, + "eval_samples_per_second": 11.238, + "eval_steps_per_second": 1.405, + "step": 1200 + }, + { + "epoch": 0.20847691247415576, + "grad_norm": 2.3979392051696777, + "learning_rate": 6.946039035591275e-08, + "logits/chosen": -2.911381959915161, + "logits/rejected": -2.8858389854431152, + "logps/chosen": -58.13728713989258, + "logps/rejected": -57.347434997558594, + "loss": 0.6857, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.03157086297869682, + "rewards/margins": 0.015606355853378773, + "rewards/rejected": -0.04717721790075302, + "step": 1210 + }, + { + "epoch": 0.2101998621640248, + "grad_norm": 2.074423313140869, + "learning_rate": 7.003444316877152e-08, + "logits/chosen": -2.9131548404693604, + "logits/rejected": -2.8910086154937744, + "logps/chosen": -57.07306671142578, + "logps/rejected": -56.53594970703125, + "loss": 0.687, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.033832598477602005, + "rewards/margins": 0.013145034201443195, + "rewards/rejected": -0.046977631747722626, + "step": 1220 + }, + { + "epoch": 0.21192281185389386, + "grad_norm": 2.4180092811584473, + "learning_rate": 7.060849598163031e-08, + "logits/chosen": -2.979348659515381, + "logits/rejected": -2.9486043453216553, + "logps/chosen": -60.274391174316406, + "logps/rejected": -56.62663650512695, + "loss": 0.6835, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.03272354230284691, + "rewards/margins": 0.02001432329416275, + "rewards/rejected": -0.05273786187171936, + "step": 1230 + }, + { + "epoch": 0.2136457615437629, + "grad_norm": 2.1716294288635254, + "learning_rate": 7.11825487944891e-08, + "logits/chosen": -2.8780722618103027, + "logits/rejected": -2.868040084838867, + "logps/chosen": -57.13683319091797, + "logps/rejected": -57.61468505859375, + "loss": 0.6873, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.041347816586494446, + "rewards/margins": 0.012507520616054535, + "rewards/rejected": -0.05385533720254898, + "step": 1240 + }, + { + "epoch": 0.21536871123363197, + "grad_norm": 2.6842474937438965, + "learning_rate": 7.175660160734788e-08, + "logits/chosen": -2.9559080600738525, + "logits/rejected": -2.925466775894165, + "logps/chosen": -59.06212615966797, + "logps/rejected": -56.11531448364258, + "loss": 0.6855, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -0.038944486528635025, + "rewards/margins": 0.01640475168824196, + "rewards/rejected": -0.05534924194216728, + "step": 1250 + }, + { + "epoch": 0.21709166092350105, + "grad_norm": 2.1675713062286377, + "learning_rate": 7.233065442020666e-08, + "logits/chosen": -2.844125747680664, + "logits/rejected": -2.8245441913604736, + "logps/chosen": -58.59064483642578, + "logps/rejected": -55.4991569519043, + "loss": 0.685, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.041406117379665375, + "rewards/margins": 0.017199214547872543, + "rewards/rejected": -0.05860533565282822, + "step": 1260 + }, + { + "epoch": 0.2188146106133701, + "grad_norm": 2.3532440662384033, + "learning_rate": 7.290470723306544e-08, + "logits/chosen": -2.880176067352295, + "logits/rejected": -2.8723835945129395, + "logps/chosen": -57.689796447753906, + "logps/rejected": -59.45355224609375, + "loss": 0.6895, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.04507274553179741, + "rewards/margins": 0.008155112154781818, + "rewards/rejected": -0.05322786048054695, + "step": 1270 + }, + { + "epoch": 0.22053756030323915, + "grad_norm": 2.100616693496704, + "learning_rate": 7.347876004592423e-08, + "logits/chosen": -2.905412197113037, + "logits/rejected": -2.913015365600586, + "logps/chosen": -56.19207763671875, + "logps/rejected": -64.13707733154297, + "loss": 0.6896, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.04303373023867607, + "rewards/margins": 0.008113773539662361, + "rewards/rejected": -0.05114750191569328, + "step": 1280 + }, + { + "epoch": 0.2222605099931082, + "grad_norm": 2.2928290367126465, + "learning_rate": 7.405281285878302e-08, + "logits/chosen": -2.8585543632507324, + "logits/rejected": -2.8368325233459473, + "logps/chosen": -59.599952697753906, + "logps/rejected": -55.80320358276367, + "loss": 0.6844, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.03863871842622757, + "rewards/margins": 0.018526822328567505, + "rewards/rejected": -0.057165540754795074, + "step": 1290 + }, + { + "epoch": 0.22398345968297725, + "grad_norm": 2.045578718185425, + "learning_rate": 7.462686567164179e-08, + "logits/chosen": -2.904303550720215, + "logits/rejected": -2.8966355323791504, + "logps/chosen": -55.30950927734375, + "logps/rejected": -59.39851760864258, + "loss": 0.6894, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.0525943860411644, + "rewards/margins": 0.008315442129969597, + "rewards/rejected": -0.060909826308488846, + "step": 1300 + }, + { + "epoch": 0.2257064093728463, + "grad_norm": 2.3341314792633057, + "learning_rate": 7.520091848450058e-08, + "logits/chosen": -2.90086030960083, + "logits/rejected": -2.877695083618164, + "logps/chosen": -58.40484619140625, + "logps/rejected": -55.46175003051758, + "loss": 0.6821, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.044036369770765305, + "rewards/margins": 0.023425104096531868, + "rewards/rejected": -0.06746147572994232, + "step": 1310 + }, + { + "epoch": 0.22742935906271536, + "grad_norm": 2.3799657821655273, + "learning_rate": 7.577497129735936e-08, + "logits/chosen": -2.9232773780822754, + "logits/rejected": -2.9070727825164795, + "logps/chosen": -60.878334045410156, + "logps/rejected": -56.428741455078125, + "loss": 0.6847, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.04339348524808884, + "rewards/margins": 0.017966218292713165, + "rewards/rejected": -0.061359703540802, + "step": 1320 + }, + { + "epoch": 0.22915230875258444, + "grad_norm": 2.3074443340301514, + "learning_rate": 7.634902411021814e-08, + "logits/chosen": -2.9480531215667725, + "logits/rejected": -2.924940586090088, + "logps/chosen": -60.75572967529297, + "logps/rejected": -56.17448806762695, + "loss": 0.6844, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.04371759295463562, + "rewards/margins": 0.01845361292362213, + "rewards/rejected": -0.06217120215296745, + "step": 1330 + }, + { + "epoch": 0.2308752584424535, + "grad_norm": 2.1518850326538086, + "learning_rate": 7.692307692307692e-08, + "logits/chosen": -2.841428279876709, + "logits/rejected": -2.8257040977478027, + "logps/chosen": -60.455482482910156, + "logps/rejected": -57.828460693359375, + "loss": 0.6864, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.04494331032037735, + "rewards/margins": 0.014719230122864246, + "rewards/rejected": -0.05966253951191902, + "step": 1340 + }, + { + "epoch": 0.23259820813232254, + "grad_norm": 2.1697258949279785, + "learning_rate": 7.749712973593571e-08, + "logits/chosen": -2.8844552040100098, + "logits/rejected": -2.867018461227417, + "logps/chosen": -60.273338317871094, + "logps/rejected": -59.45403289794922, + "loss": 0.6899, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -0.05731966346502304, + "rewards/margins": 0.007771348115056753, + "rewards/rejected": -0.06509101390838623, + "step": 1350 + }, + { + "epoch": 0.2343211578221916, + "grad_norm": 2.2805898189544678, + "learning_rate": 7.80711825487945e-08, + "logits/chosen": -2.8427302837371826, + "logits/rejected": -2.831007957458496, + "logps/chosen": -61.870033264160156, + "logps/rejected": -61.115379333496094, + "loss": 0.6899, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.05456281825900078, + "rewards/margins": 0.00770781422033906, + "rewards/rejected": -0.0622706413269043, + "step": 1360 + }, + { + "epoch": 0.23604410751206065, + "grad_norm": 2.7378504276275635, + "learning_rate": 7.864523536165327e-08, + "logits/chosen": -2.927980899810791, + "logits/rejected": -2.9119479656219482, + "logps/chosen": -59.125083923339844, + "logps/rejected": -59.64502716064453, + "loss": 0.6837, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.05251600593328476, + "rewards/margins": 0.020127257332205772, + "rewards/rejected": -0.07264326512813568, + "step": 1370 + }, + { + "epoch": 0.2377670572019297, + "grad_norm": 2.2284791469573975, + "learning_rate": 7.921928817451206e-08, + "logits/chosen": -2.870211362838745, + "logits/rejected": -2.861656904220581, + "logps/chosen": -61.35654830932617, + "logps/rejected": -57.8406982421875, + "loss": 0.687, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.05080818384885788, + "rewards/margins": 0.01347988098859787, + "rewards/rejected": -0.06428805738687515, + "step": 1380 + }, + { + "epoch": 0.23949000689179875, + "grad_norm": 2.3655896186828613, + "learning_rate": 7.979334098737084e-08, + "logits/chosen": -2.9632561206817627, + "logits/rejected": -2.9431121349334717, + "logps/chosen": -62.521148681640625, + "logps/rejected": -58.51555252075195, + "loss": 0.686, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.04506484419107437, + "rewards/margins": 0.01579303853213787, + "rewards/rejected": -0.06085788086056709, + "step": 1390 + }, + { + "epoch": 0.2412129565816678, + "grad_norm": 2.1747207641601562, + "learning_rate": 8.036739380022962e-08, + "logits/chosen": -2.8493809700012207, + "logits/rejected": -2.831693410873413, + "logps/chosen": -61.77557373046875, + "logps/rejected": -58.64817428588867, + "loss": 0.6844, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.04902204871177673, + "rewards/margins": 0.018787894397974014, + "rewards/rejected": -0.06780994683504105, + "step": 1400 + }, + { + "epoch": 0.24293590627153688, + "grad_norm": 2.0125632286071777, + "learning_rate": 8.09414466130884e-08, + "logits/chosen": -2.88521671295166, + "logits/rejected": -2.8787474632263184, + "logps/chosen": -59.67738723754883, + "logps/rejected": -59.01367950439453, + "loss": 0.6878, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.052916985005140305, + "rewards/margins": 0.01177090872079134, + "rewards/rejected": -0.06468789279460907, + "step": 1410 + }, + { + "epoch": 0.24465885596140594, + "grad_norm": 2.345404624938965, + "learning_rate": 8.151549942594719e-08, + "logits/chosen": -2.836278200149536, + "logits/rejected": -2.840115547180176, + "logps/chosen": -57.09819412231445, + "logps/rejected": -60.34763717651367, + "loss": 0.6927, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.05967681482434273, + "rewards/margins": 0.002343767322599888, + "rewards/rejected": -0.06202058121562004, + "step": 1420 + }, + { + "epoch": 0.246381805651275, + "grad_norm": 2.3286633491516113, + "learning_rate": 8.208955223880598e-08, + "logits/chosen": -2.910238742828369, + "logits/rejected": -2.9011566638946533, + "logps/chosen": -60.003883361816406, + "logps/rejected": -61.60279083251953, + "loss": 0.687, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.05464918166399002, + "rewards/margins": 0.013536456041038036, + "rewards/rejected": -0.06818564236164093, + "step": 1430 + }, + { + "epoch": 0.24810475534114404, + "grad_norm": 2.111067295074463, + "learning_rate": 8.266360505166475e-08, + "logits/chosen": -2.8533780574798584, + "logits/rejected": -2.834259510040283, + "logps/chosen": -57.13257598876953, + "logps/rejected": -54.819725036621094, + "loss": 0.6852, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.05546184256672859, + "rewards/margins": 0.01728229410946369, + "rewards/rejected": -0.07274413853883743, + "step": 1440 + }, + { + "epoch": 0.2498277050310131, + "grad_norm": 2.4927828311920166, + "learning_rate": 8.323765786452354e-08, + "logits/chosen": -2.8578834533691406, + "logits/rejected": -2.828831911087036, + "logps/chosen": -60.613731384277344, + "logps/rejected": -59.09722900390625, + "loss": 0.6838, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.049021851271390915, + "rewards/margins": 0.02033841609954834, + "rewards/rejected": -0.06936026364564896, + "step": 1450 + }, + { + "epoch": 0.25155065472088217, + "grad_norm": 2.4668631553649902, + "learning_rate": 8.381171067738232e-08, + "logits/chosen": -2.876624584197998, + "logits/rejected": -2.8503682613372803, + "logps/chosen": -59.0427360534668, + "logps/rejected": -57.6472053527832, + "loss": 0.6832, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.05816395953297615, + "rewards/margins": 0.021529266610741615, + "rewards/rejected": -0.07969322055578232, + "step": 1460 + }, + { + "epoch": 0.2532736044107512, + "grad_norm": 2.469766616821289, + "learning_rate": 8.43857634902411e-08, + "logits/chosen": -2.9928104877471924, + "logits/rejected": -2.9630894660949707, + "logps/chosen": -63.2796516418457, + "logps/rejected": -61.028541564941406, + "loss": 0.6829, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.04897930100560188, + "rewards/margins": 0.02178030088543892, + "rewards/rejected": -0.0707595944404602, + "step": 1470 + }, + { + "epoch": 0.2549965541006203, + "grad_norm": 2.3830063343048096, + "learning_rate": 8.495981630309988e-08, + "logits/chosen": -2.944361448287964, + "logits/rejected": -2.914533853530884, + "logps/chosen": -57.114402770996094, + "logps/rejected": -58.12091064453125, + "loss": 0.6848, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.049399979412555695, + "rewards/margins": 0.017925377935171127, + "rewards/rejected": -0.06732536852359772, + "step": 1480 + }, + { + "epoch": 0.2567195037904893, + "grad_norm": 2.330536365509033, + "learning_rate": 8.553386911595867e-08, + "logits/chosen": -2.927044630050659, + "logits/rejected": -2.90609073638916, + "logps/chosen": -63.553245544433594, + "logps/rejected": -59.14959716796875, + "loss": 0.6855, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.05048053339123726, + "rewards/margins": 0.016581468284130096, + "rewards/rejected": -0.06706200540065765, + "step": 1490 + }, + { + "epoch": 0.2584424534803584, + "grad_norm": 2.8656115531921387, + "learning_rate": 8.610792192881746e-08, + "logits/chosen": -2.8543686866760254, + "logits/rejected": -2.853294610977173, + "logps/chosen": -57.01494216918945, + "logps/rejected": -59.04634475708008, + "loss": 0.6882, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.05923793837428093, + "rewards/margins": 0.011348679661750793, + "rewards/rejected": -0.07058661431074142, + "step": 1500 + }, + { + "epoch": 0.2601654031702274, + "grad_norm": 2.5738108158111572, + "learning_rate": 8.668197474167623e-08, + "logits/chosen": -2.8481411933898926, + "logits/rejected": -2.822608470916748, + "logps/chosen": -63.966583251953125, + "logps/rejected": -60.66066360473633, + "loss": 0.6796, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.04969775304198265, + "rewards/margins": 0.02851223386824131, + "rewards/rejected": -0.07820998132228851, + "step": 1510 + }, + { + "epoch": 0.2618883528600965, + "grad_norm": 2.3993465900421143, + "learning_rate": 8.725602755453502e-08, + "logits/chosen": -2.9035634994506836, + "logits/rejected": -2.901787519454956, + "logps/chosen": -59.19148635864258, + "logps/rejected": -57.19902420043945, + "loss": 0.6905, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.05953991413116455, + "rewards/margins": 0.006711998488754034, + "rewards/rejected": -0.06625191122293472, + "step": 1520 + }, + { + "epoch": 0.26361130254996556, + "grad_norm": 2.638784408569336, + "learning_rate": 8.78300803673938e-08, + "logits/chosen": -2.8348419666290283, + "logits/rejected": -2.822012424468994, + "logps/chosen": -58.7889518737793, + "logps/rejected": -56.48737716674805, + "loss": 0.6879, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.059633769094944, + "rewards/margins": 0.012106634676456451, + "rewards/rejected": -0.07174040377140045, + "step": 1530 + }, + { + "epoch": 0.2653342522398346, + "grad_norm": 2.5973594188690186, + "learning_rate": 8.840413318025258e-08, + "logits/chosen": -2.913525342941284, + "logits/rejected": -2.8881072998046875, + "logps/chosen": -62.00640106201172, + "logps/rejected": -58.09117889404297, + "loss": 0.6821, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.05526846647262573, + "rewards/margins": 0.023841742426156998, + "rewards/rejected": -0.07911021262407303, + "step": 1540 + }, + { + "epoch": 0.26705720192970367, + "grad_norm": 2.4358787536621094, + "learning_rate": 8.897818599311136e-08, + "logits/chosen": -2.847963333129883, + "logits/rejected": -2.8368639945983887, + "logps/chosen": -60.75475311279297, + "logps/rejected": -57.487464904785156, + "loss": 0.6832, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.059478919953107834, + "rewards/margins": 0.02126147784292698, + "rewards/rejected": -0.08074040710926056, + "step": 1550 + }, + { + "epoch": 0.2687801516195727, + "grad_norm": 2.2986135482788086, + "learning_rate": 8.955223880597015e-08, + "logits/chosen": -2.8618578910827637, + "logits/rejected": -2.8462741374969482, + "logps/chosen": -57.96033477783203, + "logps/rejected": -57.77971267700195, + "loss": 0.6864, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.06349469721317291, + "rewards/margins": 0.01494310237467289, + "rewards/rejected": -0.07843779027462006, + "step": 1560 + }, + { + "epoch": 0.2705031013094418, + "grad_norm": 2.231766939163208, + "learning_rate": 9.012629161882894e-08, + "logits/chosen": -2.901459217071533, + "logits/rejected": -2.906397581100464, + "logps/chosen": -59.78357696533203, + "logps/rejected": -63.125389099121094, + "loss": 0.6896, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -0.06565506756305695, + "rewards/margins": 0.008577165193855762, + "rewards/rejected": -0.07423223555088043, + "step": 1570 + }, + { + "epoch": 0.2722260509993108, + "grad_norm": 2.6459128856658936, + "learning_rate": 9.070034443168771e-08, + "logits/chosen": -2.8522191047668457, + "logits/rejected": -2.8468332290649414, + "logps/chosen": -59.195945739746094, + "logps/rejected": -61.25299072265625, + "loss": 0.6854, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.06301302462816238, + "rewards/margins": 0.017189020290970802, + "rewards/rejected": -0.08020204305648804, + "step": 1580 + }, + { + "epoch": 0.2739490006891799, + "grad_norm": 2.9546637535095215, + "learning_rate": 9.12743972445465e-08, + "logits/chosen": -2.8846755027770996, + "logits/rejected": -2.8750617504119873, + "logps/chosen": -60.621368408203125, + "logps/rejected": -62.49888229370117, + "loss": 0.684, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.06025485321879387, + "rewards/margins": 0.01954682543873787, + "rewards/rejected": -0.07980167865753174, + "step": 1590 + }, + { + "epoch": 0.27567195037904896, + "grad_norm": 2.6051409244537354, + "learning_rate": 9.184845005740528e-08, + "logits/chosen": -2.8711392879486084, + "logits/rejected": -2.8471274375915527, + "logps/chosen": -59.82111740112305, + "logps/rejected": -59.49725341796875, + "loss": 0.6835, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.06477855890989304, + "rewards/margins": 0.02124771662056446, + "rewards/rejected": -0.08602626621723175, + "step": 1600 + }, + { + "epoch": 0.27567195037904896, + "eval_logits/chosen": -2.9442405700683594, + "eval_logits/rejected": -2.9410126209259033, + "eval_logps/chosen": -62.226593017578125, + "eval_logps/rejected": -67.10498046875, + "eval_loss": 0.6880334615707397, + "eval_rewards/accuracies": 0.5764405131340027, + "eval_rewards/chosen": -0.032111138105392456, + "eval_rewards/margins": 0.011442671529948711, + "eval_rewards/rejected": -0.04355381056666374, + "eval_runtime": 382.9622, + "eval_samples_per_second": 11.239, + "eval_steps_per_second": 1.405, + "step": 1600 + }, + { + "epoch": 0.277394900068918, + "grad_norm": 2.2450082302093506, + "learning_rate": 9.242250287026406e-08, + "logits/chosen": -2.891284942626953, + "logits/rejected": -2.8724706172943115, + "logps/chosen": -63.180908203125, + "logps/rejected": -65.12251281738281, + "loss": 0.6863, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.07282154262065887, + "rewards/margins": 0.015445959754288197, + "rewards/rejected": -0.0882674977183342, + "step": 1610 + }, + { + "epoch": 0.27911784975878706, + "grad_norm": 2.4446041584014893, + "learning_rate": 9.299655568312284e-08, + "logits/chosen": -2.830038547515869, + "logits/rejected": -2.8137710094451904, + "logps/chosen": -60.38079833984375, + "logps/rejected": -62.97710418701172, + "loss": 0.685, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.07806779444217682, + "rewards/margins": 0.018283111974596977, + "rewards/rejected": -0.09635090827941895, + "step": 1620 + }, + { + "epoch": 0.2808407994486561, + "grad_norm": 2.6224751472473145, + "learning_rate": 9.357060849598163e-08, + "logits/chosen": -2.875746250152588, + "logits/rejected": -2.8608651161193848, + "logps/chosen": -63.53081512451172, + "logps/rejected": -59.80907440185547, + "loss": 0.6826, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.06455497443675995, + "rewards/margins": 0.023133311420679092, + "rewards/rejected": -0.08768828958272934, + "step": 1630 + }, + { + "epoch": 0.28256374913852517, + "grad_norm": 2.5814969539642334, + "learning_rate": 9.414466130884042e-08, + "logits/chosen": -2.9426326751708984, + "logits/rejected": -2.9164793491363525, + "logps/chosen": -64.81478881835938, + "logps/rejected": -59.52144241333008, + "loss": 0.6826, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.06653226912021637, + "rewards/margins": 0.023489978164434433, + "rewards/rejected": -0.09002223610877991, + "step": 1640 + }, + { + "epoch": 0.2842866988283942, + "grad_norm": 2.584890604019165, + "learning_rate": 9.471871412169919e-08, + "logits/chosen": -2.912461519241333, + "logits/rejected": -2.8958592414855957, + "logps/chosen": -64.74848937988281, + "logps/rejected": -63.9894905090332, + "loss": 0.6821, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.06611146032810211, + "rewards/margins": 0.02439257875084877, + "rewards/rejected": -0.09050404280424118, + "step": 1650 + }, + { + "epoch": 0.28600964851826327, + "grad_norm": 2.513695240020752, + "learning_rate": 9.529276693455798e-08, + "logits/chosen": -2.90999436378479, + "logits/rejected": -2.881359577178955, + "logps/chosen": -63.374961853027344, + "logps/rejected": -58.766632080078125, + "loss": 0.6795, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.07055828720331192, + "rewards/margins": 0.029589012265205383, + "rewards/rejected": -0.1001473069190979, + "step": 1660 + }, + { + "epoch": 0.2877325982081323, + "grad_norm": 2.6238155364990234, + "learning_rate": 9.586681974741676e-08, + "logits/chosen": -2.9074604511260986, + "logits/rejected": -2.892113447189331, + "logps/chosen": -63.04148483276367, + "logps/rejected": -63.07508087158203, + "loss": 0.6839, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0675562173128128, + "rewards/margins": 0.02040962502360344, + "rewards/rejected": -0.08796583116054535, + "step": 1670 + }, + { + "epoch": 0.2894555478980014, + "grad_norm": 2.5961592197418213, + "learning_rate": 9.644087256027555e-08, + "logits/chosen": -2.881363868713379, + "logits/rejected": -2.8643298149108887, + "logps/chosen": -65.00291442871094, + "logps/rejected": -63.203407287597656, + "loss": 0.6855, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.08005902916193008, + "rewards/margins": 0.017578277736902237, + "rewards/rejected": -0.09763731062412262, + "step": 1680 + }, + { + "epoch": 0.29117849758787046, + "grad_norm": 2.4997031688690186, + "learning_rate": 9.701492537313432e-08, + "logits/chosen": -2.8670291900634766, + "logits/rejected": -2.857682466506958, + "logps/chosen": -63.41624069213867, + "logps/rejected": -59.528472900390625, + "loss": 0.6907, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.07418099790811539, + "rewards/margins": 0.0065845223143696785, + "rewards/rejected": -0.08076552301645279, + "step": 1690 + }, + { + "epoch": 0.2929014472777395, + "grad_norm": 2.485443592071533, + "learning_rate": 9.758897818599311e-08, + "logits/chosen": -2.823608636856079, + "logits/rejected": -2.8208167552948, + "logps/chosen": -58.42906951904297, + "logps/rejected": -63.24658203125, + "loss": 0.6896, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.0760272890329361, + "rewards/margins": 0.00900364015251398, + "rewards/rejected": -0.0850309282541275, + "step": 1700 + }, + { + "epoch": 0.29462439696760856, + "grad_norm": 2.994943857192993, + "learning_rate": 9.81630309988519e-08, + "logits/chosen": -2.8936543464660645, + "logits/rejected": -2.863755226135254, + "logps/chosen": -65.0849838256836, + "logps/rejected": -62.181243896484375, + "loss": 0.6805, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.0581812784075737, + "rewards/margins": 0.027432983741164207, + "rewards/rejected": -0.08561427146196365, + "step": 1710 + }, + { + "epoch": 0.2963473466574776, + "grad_norm": 2.805964708328247, + "learning_rate": 9.873708381171067e-08, + "logits/chosen": -2.874479293823242, + "logits/rejected": -2.8514676094055176, + "logps/chosen": -64.06068420410156, + "logps/rejected": -58.78630447387695, + "loss": 0.6855, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.0691644698381424, + "rewards/margins": 0.017094207927584648, + "rewards/rejected": -0.08625867962837219, + "step": 1720 + }, + { + "epoch": 0.29807029634734666, + "grad_norm": 2.502676486968994, + "learning_rate": 9.931113662456946e-08, + "logits/chosen": -2.903549909591675, + "logits/rejected": -2.8940093517303467, + "logps/chosen": -62.865089416503906, + "logps/rejected": -60.605796813964844, + "loss": 0.6892, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.07542423903942108, + "rewards/margins": 0.01010823529213667, + "rewards/rejected": -0.08553248643875122, + "step": 1730 + }, + { + "epoch": 0.2997932460372157, + "grad_norm": 2.5997302532196045, + "learning_rate": 9.988518943742824e-08, + "logits/chosen": -2.8790831565856934, + "logits/rejected": -2.8717141151428223, + "logps/chosen": -60.32866668701172, + "logps/rejected": -63.63134765625, + "loss": 0.6829, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.07663007080554962, + "rewards/margins": 0.022682690992951393, + "rewards/rejected": -0.09931276738643646, + "step": 1740 + }, + { + "epoch": 0.30151619572708477, + "grad_norm": 2.848219633102417, + "learning_rate": 9.999993568953616e-08, + "logits/chosen": -2.877021312713623, + "logits/rejected": -2.870337724685669, + "logps/chosen": -62.94545364379883, + "logps/rejected": -64.73450469970703, + "loss": 0.6853, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.06983025372028351, + "rewards/margins": 0.017571711912751198, + "rewards/rejected": -0.08740197122097015, + "step": 1750 + }, + { + "epoch": 0.30323914541695385, + "grad_norm": 2.517097234725952, + "learning_rate": 9.99996744285603e-08, + "logits/chosen": -2.8967089653015137, + "logits/rejected": -2.87041974067688, + "logps/chosen": -64.09073638916016, + "logps/rejected": -61.933387756347656, + "loss": 0.6799, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.07073361426591873, + "rewards/margins": 0.028766438364982605, + "rewards/rejected": -0.09950004518032074, + "step": 1760 + }, + { + "epoch": 0.3049620951068229, + "grad_norm": 2.3977444171905518, + "learning_rate": 9.999921219871774e-08, + "logits/chosen": -2.8966360092163086, + "logits/rejected": -2.8674557209014893, + "logps/chosen": -63.9361572265625, + "logps/rejected": -58.58233642578125, + "loss": 0.6789, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.07565224170684814, + "rewards/margins": 0.030777927488088608, + "rewards/rejected": -0.10643017292022705, + "step": 1770 + }, + { + "epoch": 0.30668504479669195, + "grad_norm": 3.279787540435791, + "learning_rate": 9.99985490018664e-08, + "logits/chosen": -2.8627562522888184, + "logits/rejected": -2.863924741744995, + "logps/chosen": -62.496971130371094, + "logps/rejected": -67.43034362792969, + "loss": 0.6822, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.07575863599777222, + "rewards/margins": 0.024096451699733734, + "rewards/rejected": -0.09985508024692535, + "step": 1780 + }, + { + "epoch": 0.308407994486561, + "grad_norm": 2.9971201419830322, + "learning_rate": 9.99976848406719e-08, + "logits/chosen": -2.9246413707733154, + "logits/rejected": -2.9111831188201904, + "logps/chosen": -60.837120056152344, + "logps/rejected": -62.72722244262695, + "loss": 0.6796, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.07828166335821152, + "rewards/margins": 0.029480207711458206, + "rewards/rejected": -0.10776188224554062, + "step": 1790 + }, + { + "epoch": 0.31013094417643006, + "grad_norm": 2.7353081703186035, + "learning_rate": 9.999661971860766e-08, + "logits/chosen": -2.9237895011901855, + "logits/rejected": -2.9061765670776367, + "logps/chosen": -61.31614303588867, + "logps/rejected": -62.551971435546875, + "loss": 0.6838, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.09031987190246582, + "rewards/margins": 0.020791074261069298, + "rewards/rejected": -0.11111094802618027, + "step": 1800 + }, + { + "epoch": 0.3118538938662991, + "grad_norm": 3.083889961242676, + "learning_rate": 9.999535363995486e-08, + "logits/chosen": -2.9037399291992188, + "logits/rejected": -2.8783864974975586, + "logps/chosen": -65.89918518066406, + "logps/rejected": -63.4612922668457, + "loss": 0.6836, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.08302709460258484, + "rewards/margins": 0.021477878093719482, + "rewards/rejected": -0.10450496524572372, + "step": 1810 + }, + { + "epoch": 0.31357684355616816, + "grad_norm": 2.7066895961761475, + "learning_rate": 9.999388660980235e-08, + "logits/chosen": -2.8926279544830322, + "logits/rejected": -2.8697822093963623, + "logps/chosen": -64.84931945800781, + "logps/rejected": -61.872032165527344, + "loss": 0.6836, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.0911707729101181, + "rewards/margins": 0.021647578105330467, + "rewards/rejected": -0.11281834542751312, + "step": 1820 + }, + { + "epoch": 0.31529979324603724, + "grad_norm": 2.6957552433013916, + "learning_rate": 9.999221863404672e-08, + "logits/chosen": -2.8292651176452637, + "logits/rejected": -2.8238108158111572, + "logps/chosen": -64.50049591064453, + "logps/rejected": -64.85762786865234, + "loss": 0.6837, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.08205254375934601, + "rewards/margins": 0.0213328804820776, + "rewards/rejected": -0.10338541120290756, + "step": 1830 + }, + { + "epoch": 0.31702274293590627, + "grad_norm": 2.7012877464294434, + "learning_rate": 9.999034971939226e-08, + "logits/chosen": -2.9520134925842285, + "logits/rejected": -2.947157621383667, + "logps/chosen": -66.78407287597656, + "logps/rejected": -66.19480895996094, + "loss": 0.6873, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.10865718126296997, + "rewards/margins": 0.01415513176470995, + "rewards/rejected": -0.12281231582164764, + "step": 1840 + }, + { + "epoch": 0.31874569262577535, + "grad_norm": 3.3216099739074707, + "learning_rate": 9.998827987335088e-08, + "logits/chosen": -2.8799540996551514, + "logits/rejected": -2.8814988136291504, + "logps/chosen": -65.33901977539062, + "logps/rejected": -65.83836364746094, + "loss": 0.6898, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.11164311319589615, + "rewards/margins": 0.009035361930727959, + "rewards/rejected": -0.12067846953868866, + "step": 1850 + }, + { + "epoch": 0.32046864231564437, + "grad_norm": 2.923114061355591, + "learning_rate": 9.998600910424211e-08, + "logits/chosen": -2.8394248485565186, + "logits/rejected": -2.816847324371338, + "logps/chosen": -67.88639068603516, + "logps/rejected": -64.48649597167969, + "loss": 0.6762, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08513197302818298, + "rewards/margins": 0.036557335406541824, + "rewards/rejected": -0.12168930470943451, + "step": 1860 + }, + { + "epoch": 0.32219159200551345, + "grad_norm": 3.0153424739837646, + "learning_rate": 9.99835374211931e-08, + "logits/chosen": -2.8406834602355957, + "logits/rejected": -2.8223063945770264, + "logps/chosen": -65.89734649658203, + "logps/rejected": -64.9059066772461, + "loss": 0.6785, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.09063565731048584, + "rewards/margins": 0.03220418840646744, + "rewards/rejected": -0.12283984571695328, + "step": 1870 + }, + { + "epoch": 0.3239145416953825, + "grad_norm": 2.781935453414917, + "learning_rate": 9.998086483413856e-08, + "logits/chosen": -2.8889684677124023, + "logits/rejected": -2.86552095413208, + "logps/chosen": -63.65220260620117, + "logps/rejected": -63.5311393737793, + "loss": 0.6768, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.09271065145730972, + "rewards/margins": 0.03604603186249733, + "rewards/rejected": -0.12875667214393616, + "step": 1880 + }, + { + "epoch": 0.32563749138525155, + "grad_norm": 3.0753352642059326, + "learning_rate": 9.997799135382066e-08, + "logits/chosen": -2.8843913078308105, + "logits/rejected": -2.879636287689209, + "logps/chosen": -62.85695266723633, + "logps/rejected": -64.9014892578125, + "loss": 0.6829, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.09137921035289764, + "rewards/margins": 0.02333098277449608, + "rewards/rejected": -0.11471019685268402, + "step": 1890 + }, + { + "epoch": 0.32736044107512063, + "grad_norm": 2.8090198040008545, + "learning_rate": 9.997491699178911e-08, + "logits/chosen": -2.8752949237823486, + "logits/rejected": -2.854523181915283, + "logps/chosen": -66.43936157226562, + "logps/rejected": -62.73388671875, + "loss": 0.6775, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.0817694216966629, + "rewards/margins": 0.034055549651384354, + "rewards/rejected": -0.11582496017217636, + "step": 1900 + }, + { + "epoch": 0.32908339076498966, + "grad_norm": 3.1820809841156006, + "learning_rate": 9.997164176040098e-08, + "logits/chosen": -2.796372175216675, + "logits/rejected": -2.771918773651123, + "logps/chosen": -64.6166763305664, + "logps/rejected": -66.13924407958984, + "loss": 0.6767, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.09831389784812927, + "rewards/margins": 0.03565414994955063, + "rewards/rejected": -0.1339680403470993, + "step": 1910 + }, + { + "epoch": 0.33080634045485874, + "grad_norm": 3.1440021991729736, + "learning_rate": 9.996816567282078e-08, + "logits/chosen": -2.853527545928955, + "logits/rejected": -2.8333311080932617, + "logps/chosen": -66.1489028930664, + "logps/rejected": -65.1463851928711, + "loss": 0.6801, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.0968015193939209, + "rewards/margins": 0.02934463880956173, + "rewards/rejected": -0.12614616751670837, + "step": 1920 + }, + { + "epoch": 0.33252929014472776, + "grad_norm": 2.9556357860565186, + "learning_rate": 9.996448874302028e-08, + "logits/chosen": -2.8163414001464844, + "logits/rejected": -2.7914609909057617, + "logps/chosen": -66.2785873413086, + "logps/rejected": -66.2317123413086, + "loss": 0.68, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.10773171484470367, + "rewards/margins": 0.02885953150689602, + "rewards/rejected": -0.13659124076366425, + "step": 1930 + }, + { + "epoch": 0.33425223983459684, + "grad_norm": 3.3226914405822754, + "learning_rate": 9.996061098577856e-08, + "logits/chosen": -2.8367037773132324, + "logits/rejected": -2.826648473739624, + "logps/chosen": -63.57377243041992, + "logps/rejected": -63.56377029418945, + "loss": 0.6829, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.11578904092311859, + "rewards/margins": 0.02378334477543831, + "rewards/rejected": -0.1395723819732666, + "step": 1940 + }, + { + "epoch": 0.33597518952446587, + "grad_norm": 2.77713680267334, + "learning_rate": 9.995653241668189e-08, + "logits/chosen": -2.861672878265381, + "logits/rejected": -2.859971046447754, + "logps/chosen": -64.96310424804688, + "logps/rejected": -68.37250518798828, + "loss": 0.6861, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.11934344470500946, + "rewards/margins": 0.01718161627650261, + "rewards/rejected": -0.13652506470680237, + "step": 1950 + }, + { + "epoch": 0.33769813921433495, + "grad_norm": 2.9392669200897217, + "learning_rate": 9.995225305212369e-08, + "logits/chosen": -2.8549649715423584, + "logits/rejected": -2.8416645526885986, + "logps/chosen": -66.29956817626953, + "logps/rejected": -67.89146423339844, + "loss": 0.6798, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.11181751638650894, + "rewards/margins": 0.02995738759636879, + "rewards/rejected": -0.14177490770816803, + "step": 1960 + }, + { + "epoch": 0.33942108890420397, + "grad_norm": 2.8342604637145996, + "learning_rate": 9.994777290930442e-08, + "logits/chosen": -2.8708348274230957, + "logits/rejected": -2.8521673679351807, + "logps/chosen": -68.05824279785156, + "logps/rejected": -64.81471252441406, + "loss": 0.6819, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.10849817097187042, + "rewards/margins": 0.02602517604827881, + "rewards/rejected": -0.13452333211898804, + "step": 1970 + }, + { + "epoch": 0.34114403859407305, + "grad_norm": 3.1742191314697266, + "learning_rate": 9.994309200623163e-08, + "logits/chosen": -2.814614772796631, + "logits/rejected": -2.807882308959961, + "logps/chosen": -68.45671081542969, + "logps/rejected": -66.14873504638672, + "loss": 0.6907, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.12103524059057236, + "rewards/margins": 0.007765748538076878, + "rewards/rejected": -0.12880098819732666, + "step": 1980 + }, + { + "epoch": 0.34286698828394213, + "grad_norm": 2.822662353515625, + "learning_rate": 9.993821036171974e-08, + "logits/chosen": -2.8995895385742188, + "logits/rejected": -2.8736507892608643, + "logps/chosen": -63.92681884765625, + "logps/rejected": -63.00006866455078, + "loss": 0.6751, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.11917207390069962, + "rewards/margins": 0.039555564522743225, + "rewards/rejected": -0.15872761607170105, + "step": 1990 + }, + { + "epoch": 0.34458993797381116, + "grad_norm": 3.4786696434020996, + "learning_rate": 9.993312799539004e-08, + "logits/chosen": -2.8508896827697754, + "logits/rejected": -2.8577001094818115, + "logps/chosen": -63.96714401245117, + "logps/rejected": -71.07959747314453, + "loss": 0.6865, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.13510540127754211, + "rewards/margins": 0.016423583030700684, + "rewards/rejected": -0.151528999209404, + "step": 2000 + }, + { + "epoch": 0.34458993797381116, + "eval_logits/chosen": -2.9192111492156982, + "eval_logits/rejected": -2.915797233581543, + "eval_logps/chosen": -65.91581726074219, + "eval_logps/rejected": -71.4878158569336, + "eval_loss": 0.685180127620697, + "eval_rewards/accuracies": 0.5713289976119995, + "eval_rewards/chosen": -0.06900343298912048, + "eval_rewards/margins": 0.018378695473074913, + "eval_rewards/rejected": -0.08738213777542114, + "eval_runtime": 382.8756, + "eval_samples_per_second": 11.241, + "eval_steps_per_second": 1.405, + "step": 2000 + }, + { + "epoch": 0.34631288766368024, + "grad_norm": 3.048440456390381, + "learning_rate": 9.992784492767061e-08, + "logits/chosen": -2.8806567192077637, + "logits/rejected": -2.8591017723083496, + "logps/chosen": -67.69590759277344, + "logps/rejected": -64.71334075927734, + "loss": 0.6796, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.10972080379724503, + "rewards/margins": 0.030075108632445335, + "rewards/rejected": -0.1397959142923355, + "step": 2010 + }, + { + "epoch": 0.34803583735354926, + "grad_norm": 3.0328872203826904, + "learning_rate": 9.992236117979623e-08, + "logits/chosen": -2.847768545150757, + "logits/rejected": -2.834040403366089, + "logps/chosen": -62.909156799316406, + "logps/rejected": -69.29357147216797, + "loss": 0.6768, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.11996938288211823, + "rewards/margins": 0.0360104963183403, + "rewards/rejected": -0.15597985684871674, + "step": 2020 + }, + { + "epoch": 0.34975878704341834, + "grad_norm": 5.03533935546875, + "learning_rate": 9.991667677380831e-08, + "logits/chosen": -2.917463779449463, + "logits/rejected": -2.9009978771209717, + "logps/chosen": -68.2878189086914, + "logps/rejected": -68.7306137084961, + "loss": 0.6816, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.10773104429244995, + "rewards/margins": 0.026072192937135696, + "rewards/rejected": -0.13380321860313416, + "step": 2030 + }, + { + "epoch": 0.35148173673328736, + "grad_norm": 2.9555203914642334, + "learning_rate": 9.991079173255476e-08, + "logits/chosen": -2.817652702331543, + "logits/rejected": -2.8122856616973877, + "logps/chosen": -65.14109802246094, + "logps/rejected": -66.4891357421875, + "loss": 0.6815, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.10871319472789764, + "rewards/margins": 0.02643074095249176, + "rewards/rejected": -0.1351439505815506, + "step": 2040 + }, + { + "epoch": 0.35320468642315644, + "grad_norm": 3.105513572692871, + "learning_rate": 9.990470607968994e-08, + "logits/chosen": -2.886338233947754, + "logits/rejected": -2.8689470291137695, + "logps/chosen": -62.5875358581543, + "logps/rejected": -66.26753234863281, + "loss": 0.6845, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -0.10509078204631805, + "rewards/margins": 0.02006548084318638, + "rewards/rejected": -0.1251562535762787, + "step": 2050 + }, + { + "epoch": 0.3549276361130255, + "grad_norm": 3.6310055255889893, + "learning_rate": 9.989841983967456e-08, + "logits/chosen": -2.8897366523742676, + "logits/rejected": -2.8637399673461914, + "logps/chosen": -65.2972183227539, + "logps/rejected": -64.59178161621094, + "loss": 0.6762, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.09557583928108215, + "rewards/margins": 0.0371161587536335, + "rewards/rejected": -0.13269200921058655, + "step": 2060 + }, + { + "epoch": 0.35665058580289455, + "grad_norm": 2.7718076705932617, + "learning_rate": 9.989193303777551e-08, + "logits/chosen": -2.903705596923828, + "logits/rejected": -2.8876516819000244, + "logps/chosen": -66.88301086425781, + "logps/rejected": -67.48914337158203, + "loss": 0.6812, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.10493052005767822, + "rewards/margins": 0.027421722188591957, + "rewards/rejected": -0.13235223293304443, + "step": 2070 + }, + { + "epoch": 0.35837353549276363, + "grad_norm": 3.4504640102386475, + "learning_rate": 9.988524570006591e-08, + "logits/chosen": -2.839569091796875, + "logits/rejected": -2.8218274116516113, + "logps/chosen": -64.51222229003906, + "logps/rejected": -65.25495910644531, + "loss": 0.6755, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.11703022569417953, + "rewards/margins": 0.038812413811683655, + "rewards/rejected": -0.1558426320552826, + "step": 2080 + }, + { + "epoch": 0.36009648518263265, + "grad_norm": 3.131420612335205, + "learning_rate": 9.987835785342484e-08, + "logits/chosen": -2.870603561401367, + "logits/rejected": -2.8721537590026855, + "logps/chosen": -67.33354187011719, + "logps/rejected": -67.72920989990234, + "loss": 0.6841, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.11783289909362793, + "rewards/margins": 0.020953617990016937, + "rewards/rejected": -0.13878652453422546, + "step": 2090 + }, + { + "epoch": 0.36181943487250173, + "grad_norm": 3.2706260681152344, + "learning_rate": 9.987126952553735e-08, + "logits/chosen": -2.834599256515503, + "logits/rejected": -2.8189969062805176, + "logps/chosen": -68.30904388427734, + "logps/rejected": -65.57106018066406, + "loss": 0.6791, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.12041591107845306, + "rewards/margins": 0.03133242577314377, + "rewards/rejected": -0.15174834430217743, + "step": 2100 + }, + { + "epoch": 0.36354238456237076, + "grad_norm": 3.030902862548828, + "learning_rate": 9.986398074489428e-08, + "logits/chosen": -2.8516478538513184, + "logits/rejected": -2.8527791500091553, + "logps/chosen": -63.29868698120117, + "logps/rejected": -68.52655792236328, + "loss": 0.6874, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.1283387690782547, + "rewards/margins": 0.015200036577880383, + "rewards/rejected": -0.14353878796100616, + "step": 2110 + }, + { + "epoch": 0.36526533425223984, + "grad_norm": 3.139935255050659, + "learning_rate": 9.985649154079221e-08, + "logits/chosen": -2.8046183586120605, + "logits/rejected": -2.786285400390625, + "logps/chosen": -65.4697036743164, + "logps/rejected": -64.31707000732422, + "loss": 0.6797, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.10919544845819473, + "rewards/margins": 0.030463799834251404, + "rewards/rejected": -0.13965924084186554, + "step": 2120 + }, + { + "epoch": 0.3669882839421089, + "grad_norm": 3.0774590969085693, + "learning_rate": 9.984880194333322e-08, + "logits/chosen": -2.808724880218506, + "logits/rejected": -2.79413104057312, + "logps/chosen": -67.2655258178711, + "logps/rejected": -68.5387191772461, + "loss": 0.6719, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.12331392616033554, + "rewards/margins": 0.04621954262256622, + "rewards/rejected": -0.16953346133232117, + "step": 2130 + }, + { + "epoch": 0.36871123363197794, + "grad_norm": 3.1437113285064697, + "learning_rate": 9.984091198342495e-08, + "logits/chosen": -2.794424533843994, + "logits/rejected": -2.7891032695770264, + "logps/chosen": -63.76580047607422, + "logps/rejected": -69.71234130859375, + "loss": 0.6809, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.14377468824386597, + "rewards/margins": 0.02725168690085411, + "rewards/rejected": -0.17102637887001038, + "step": 2140 + }, + { + "epoch": 0.370434183321847, + "grad_norm": 3.1667582988739014, + "learning_rate": 9.983282169278032e-08, + "logits/chosen": -2.8194727897644043, + "logits/rejected": -2.787461042404175, + "logps/chosen": -68.21852111816406, + "logps/rejected": -64.78960418701172, + "loss": 0.6677, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.11504881083965302, + "rewards/margins": 0.05498508736491203, + "rewards/rejected": -0.17003390192985535, + "step": 2150 + }, + { + "epoch": 0.37215713301171605, + "grad_norm": 3.9630634784698486, + "learning_rate": 9.982453110391746e-08, + "logits/chosen": -2.8034210205078125, + "logits/rejected": -2.7924370765686035, + "logps/chosen": -69.6778335571289, + "logps/rejected": -64.76737976074219, + "loss": 0.6812, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.13194985687732697, + "rewards/margins": 0.027023714035749435, + "rewards/rejected": -0.1589735895395279, + "step": 2160 + }, + { + "epoch": 0.3738800827015851, + "grad_norm": 3.4963889122009277, + "learning_rate": 9.981604025015961e-08, + "logits/chosen": -2.8941502571105957, + "logits/rejected": -2.882629871368408, + "logps/chosen": -67.54387664794922, + "logps/rejected": -67.07762145996094, + "loss": 0.6756, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.1255626529455185, + "rewards/margins": 0.03818827494978905, + "rewards/rejected": -0.16375091671943665, + "step": 2170 + }, + { + "epoch": 0.37560303239145415, + "grad_norm": 3.656014919281006, + "learning_rate": 9.980734916563493e-08, + "logits/chosen": -2.9170708656311035, + "logits/rejected": -2.892664909362793, + "logps/chosen": -73.61463165283203, + "logps/rejected": -70.44910430908203, + "loss": 0.6746, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.14075477421283722, + "rewards/margins": 0.041772354394197464, + "rewards/rejected": -0.1825271099805832, + "step": 2180 + }, + { + "epoch": 0.37732598208132323, + "grad_norm": 3.52813720703125, + "learning_rate": 9.97984578852764e-08, + "logits/chosen": -2.9582104682922363, + "logits/rejected": -2.9556591510772705, + "logps/chosen": -66.61128997802734, + "logps/rejected": -68.55953979492188, + "loss": 0.6844, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.1338820457458496, + "rewards/margins": 0.020972993224859238, + "rewards/rejected": -0.15485504269599915, + "step": 2190 + }, + { + "epoch": 0.37904893177119225, + "grad_norm": 4.082705497741699, + "learning_rate": 9.978936644482165e-08, + "logits/chosen": -2.8421432971954346, + "logits/rejected": -2.829256534576416, + "logps/chosen": -65.4125747680664, + "logps/rejected": -70.35450744628906, + "loss": 0.6735, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.12541845440864563, + "rewards/margins": 0.043675925582647324, + "rewards/rejected": -0.16909436881542206, + "step": 2200 + }, + { + "epoch": 0.38077188146106133, + "grad_norm": 3.55794620513916, + "learning_rate": 9.978007488081286e-08, + "logits/chosen": -2.8825392723083496, + "logits/rejected": -2.8723480701446533, + "logps/chosen": -68.32068634033203, + "logps/rejected": -68.001220703125, + "loss": 0.6797, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.12466458976268768, + "rewards/margins": 0.031077438965439796, + "rewards/rejected": -0.15574204921722412, + "step": 2210 + }, + { + "epoch": 0.3824948311509304, + "grad_norm": 3.548194169998169, + "learning_rate": 9.977058323059658e-08, + "logits/chosen": -2.763171672821045, + "logits/rejected": -2.756930112838745, + "logps/chosen": -67.42240142822266, + "logps/rejected": -68.9684066772461, + "loss": 0.6809, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.12630632519721985, + "rewards/margins": 0.0291056577116251, + "rewards/rejected": -0.1554119884967804, + "step": 2220 + }, + { + "epoch": 0.38421778084079944, + "grad_norm": 4.489490509033203, + "learning_rate": 9.976089153232354e-08, + "logits/chosen": -2.8702690601348877, + "logits/rejected": -2.849177598953247, + "logps/chosen": -68.55046844482422, + "logps/rejected": -66.85404968261719, + "loss": 0.6804, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.14111894369125366, + "rewards/margins": 0.02869265154004097, + "rewards/rejected": -0.16981160640716553, + "step": 2230 + }, + { + "epoch": 0.3859407305306685, + "grad_norm": 3.8217995166778564, + "learning_rate": 9.975099982494864e-08, + "logits/chosen": -2.8618292808532715, + "logits/rejected": -2.8326847553253174, + "logps/chosen": -70.48759460449219, + "logps/rejected": -69.0060806274414, + "loss": 0.6802, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.13084883987903595, + "rewards/margins": 0.02972751297056675, + "rewards/rejected": -0.16057637333869934, + "step": 2240 + }, + { + "epoch": 0.38766368022053754, + "grad_norm": 3.6844048500061035, + "learning_rate": 9.974090814823062e-08, + "logits/chosen": -2.8011302947998047, + "logits/rejected": -2.774014711380005, + "logps/chosen": -69.90562438964844, + "logps/rejected": -68.50813293457031, + "loss": 0.6814, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.13328006863594055, + "rewards/margins": 0.026850569993257523, + "rewards/rejected": -0.16013064980506897, + "step": 2250 + }, + { + "epoch": 0.3893866299104066, + "grad_norm": 3.8670570850372314, + "learning_rate": 9.9730616542732e-08, + "logits/chosen": -2.805306911468506, + "logits/rejected": -2.7785191535949707, + "logps/chosen": -74.76165771484375, + "logps/rejected": -73.67105102539062, + "loss": 0.6778, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.13669700920581818, + "rewards/margins": 0.034947678446769714, + "rewards/rejected": -0.1716446876525879, + "step": 2260 + }, + { + "epoch": 0.39110957960027565, + "grad_norm": 4.4944047927856445, + "learning_rate": 9.972012504981892e-08, + "logits/chosen": -2.795133113861084, + "logits/rejected": -2.76835560798645, + "logps/chosen": -69.20185852050781, + "logps/rejected": -68.29955291748047, + "loss": 0.6784, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.1428747922182083, + "rewards/margins": 0.03427129238843918, + "rewards/rejected": -0.1771460920572281, + "step": 2270 + }, + { + "epoch": 0.3928325292901447, + "grad_norm": 3.459359645843506, + "learning_rate": 9.970943371166087e-08, + "logits/chosen": -2.8179843425750732, + "logits/rejected": -2.8150620460510254, + "logps/chosen": -67.28515625, + "logps/rejected": -70.26941680908203, + "loss": 0.683, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.12593597173690796, + "rewards/margins": 0.023327378556132317, + "rewards/rejected": -0.14926335215568542, + "step": 2280 + }, + { + "epoch": 0.3945554789800138, + "grad_norm": 3.351472854614258, + "learning_rate": 9.969854257123071e-08, + "logits/chosen": -2.7689754962921143, + "logits/rejected": -2.751616954803467, + "logps/chosen": -66.91730499267578, + "logps/rejected": -69.58160400390625, + "loss": 0.6754, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.13506877422332764, + "rewards/margins": 0.03949803113937378, + "rewards/rejected": -0.17456680536270142, + "step": 2290 + }, + { + "epoch": 0.39627842866988283, + "grad_norm": 3.6811389923095703, + "learning_rate": 9.968745167230428e-08, + "logits/chosen": -2.881148099899292, + "logits/rejected": -2.855869770050049, + "logps/chosen": -69.73558807373047, + "logps/rejected": -68.98726654052734, + "loss": 0.6736, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.11700799316167831, + "rewards/margins": 0.043230026960372925, + "rewards/rejected": -0.16023802757263184, + "step": 2300 + }, + { + "epoch": 0.3980013783597519, + "grad_norm": 3.873969078063965, + "learning_rate": 9.967616105946042e-08, + "logits/chosen": -2.8135409355163574, + "logits/rejected": -2.801391124725342, + "logps/chosen": -65.8260498046875, + "logps/rejected": -67.92823791503906, + "loss": 0.6757, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.13845619559288025, + "rewards/margins": 0.03907342627644539, + "rewards/rejected": -0.17752963304519653, + "step": 2310 + }, + { + "epoch": 0.39972432804962094, + "grad_norm": 3.8343992233276367, + "learning_rate": 9.966467077808063e-08, + "logits/chosen": -2.8660356998443604, + "logits/rejected": -2.833324432373047, + "logps/chosen": -71.3488540649414, + "logps/rejected": -69.46772003173828, + "loss": 0.6662, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.1275714635848999, + "rewards/margins": 0.05821261554956436, + "rewards/rejected": -0.18578408658504486, + "step": 2320 + }, + { + "epoch": 0.40144727773949, + "grad_norm": 3.9437179565429688, + "learning_rate": 9.965298087434898e-08, + "logits/chosen": -2.8199102878570557, + "logits/rejected": -2.8103442192077637, + "logps/chosen": -71.7131118774414, + "logps/rejected": -71.01388549804688, + "loss": 0.6686, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.14351196587085724, + "rewards/margins": 0.05403792858123779, + "rewards/rejected": -0.19754989445209503, + "step": 2330 + }, + { + "epoch": 0.40317022742935904, + "grad_norm": 4.509121417999268, + "learning_rate": 9.964109139525195e-08, + "logits/chosen": -2.8408901691436768, + "logits/rejected": -2.834322214126587, + "logps/chosen": -69.10721588134766, + "logps/rejected": -72.46755981445312, + "loss": 0.6859, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -0.15892067551612854, + "rewards/margins": 0.019664833322167397, + "rewards/rejected": -0.1785854995250702, + "step": 2340 + }, + { + "epoch": 0.4048931771192281, + "grad_norm": 3.775643825531006, + "learning_rate": 9.962900238857812e-08, + "logits/chosen": -2.8256442546844482, + "logits/rejected": -2.8138177394866943, + "logps/chosen": -71.73432922363281, + "logps/rejected": -72.02094268798828, + "loss": 0.6713, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.12964202463626862, + "rewards/margins": 0.04859600216150284, + "rewards/rejected": -0.17823801934719086, + "step": 2350 + }, + { + "epoch": 0.4066161268090972, + "grad_norm": 3.601081132888794, + "learning_rate": 9.96167139029181e-08, + "logits/chosen": -2.8538124561309814, + "logits/rejected": -2.8433425426483154, + "logps/chosen": -66.75715637207031, + "logps/rejected": -68.6590347290039, + "loss": 0.6775, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.14916284382343292, + "rewards/margins": 0.03558550402522087, + "rewards/rejected": -0.1847483515739441, + "step": 2360 + }, + { + "epoch": 0.4083390764989662, + "grad_norm": 4.274349689483643, + "learning_rate": 9.960422598766427e-08, + "logits/chosen": -2.8547940254211426, + "logits/rejected": -2.8508224487304688, + "logps/chosen": -71.09428405761719, + "logps/rejected": -73.29460906982422, + "loss": 0.6749, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.16236889362335205, + "rewards/margins": 0.03996804356575012, + "rewards/rejected": -0.20233693718910217, + "step": 2370 + }, + { + "epoch": 0.4100620261888353, + "grad_norm": 3.6479640007019043, + "learning_rate": 9.95915386930106e-08, + "logits/chosen": -2.8166608810424805, + "logits/rejected": -2.8011667728424072, + "logps/chosen": -69.60545349121094, + "logps/rejected": -70.36505126953125, + "loss": 0.678, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.14873133599758148, + "rewards/margins": 0.03490529954433441, + "rewards/rejected": -0.1836366355419159, + "step": 2380 + }, + { + "epoch": 0.41178497587870433, + "grad_norm": 4.576180934906006, + "learning_rate": 9.957865206995243e-08, + "logits/chosen": -2.8613834381103516, + "logits/rejected": -2.842647075653076, + "logps/chosen": -73.18601989746094, + "logps/rejected": -72.28538513183594, + "loss": 0.6719, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.16160975396633148, + "rewards/margins": 0.04711094871163368, + "rewards/rejected": -0.20872068405151367, + "step": 2390 + }, + { + "epoch": 0.4135079255685734, + "grad_norm": 3.625098943710327, + "learning_rate": 9.956556617028632e-08, + "logits/chosen": -2.8957629203796387, + "logits/rejected": -2.885424852371216, + "logps/chosen": -68.98573303222656, + "logps/rejected": -72.5274429321289, + "loss": 0.6767, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.16070397198200226, + "rewards/margins": 0.037665095180273056, + "rewards/rejected": -0.19836905598640442, + "step": 2400 + }, + { + "epoch": 0.4135079255685734, + "eval_logits/chosen": -2.8938286304473877, + "eval_logits/rejected": -2.890582799911499, + "eval_logps/chosen": -69.88031768798828, + "eval_logps/rejected": -76.26507568359375, + "eval_loss": 0.681674063205719, + "eval_rewards/accuracies": 0.5815520286560059, + "eval_rewards/chosen": -0.108648382127285, + "eval_rewards/margins": 0.026506369933485985, + "eval_rewards/rejected": -0.13515476882457733, + "eval_runtime": 383.5029, + "eval_samples_per_second": 11.223, + "eval_steps_per_second": 1.403, + "step": 2400 + }, + { + "epoch": 0.41523087525844243, + "grad_norm": 3.9140706062316895, + "learning_rate": 9.955228104660978e-08, + "logits/chosen": -2.860299825668335, + "logits/rejected": -2.8347489833831787, + "logps/chosen": -71.68931579589844, + "logps/rejected": -69.27977752685547, + "loss": 0.6715, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.159579336643219, + "rewards/margins": 0.048062894493341446, + "rewards/rejected": -0.20764222741127014, + "step": 2410 + }, + { + "epoch": 0.4169538249483115, + "grad_norm": 3.6326847076416016, + "learning_rate": 9.953879675232106e-08, + "logits/chosen": -2.869041919708252, + "logits/rejected": -2.849848747253418, + "logps/chosen": -73.65645599365234, + "logps/rejected": -74.63279724121094, + "loss": 0.6765, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.1667543202638626, + "rewards/margins": 0.03787728026509285, + "rewards/rejected": -0.20463159680366516, + "step": 2420 + }, + { + "epoch": 0.41867677463818054, + "grad_norm": 3.5972187519073486, + "learning_rate": 9.952511334161901e-08, + "logits/chosen": -2.818706750869751, + "logits/rejected": -2.802964448928833, + "logps/chosen": -72.43135070800781, + "logps/rejected": -71.22188568115234, + "loss": 0.6795, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.16664886474609375, + "rewards/margins": 0.03179159015417099, + "rewards/rejected": -0.19844046235084534, + "step": 2430 + }, + { + "epoch": 0.4203997243280496, + "grad_norm": 6.047811985015869, + "learning_rate": 9.951123086950277e-08, + "logits/chosen": -2.8392794132232666, + "logits/rejected": -2.829869270324707, + "logps/chosen": -73.2491683959961, + "logps/rejected": -74.15043640136719, + "loss": 0.6764, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.16629108786582947, + "rewards/margins": 0.03869754448533058, + "rewards/rejected": -0.20498862862586975, + "step": 2440 + }, + { + "epoch": 0.4221226740179187, + "grad_norm": 4.199243545532227, + "learning_rate": 9.949714939177159e-08, + "logits/chosen": -2.8145751953125, + "logits/rejected": -2.797954559326172, + "logps/chosen": -72.74483489990234, + "logps/rejected": -74.14974212646484, + "loss": 0.674, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.18195998668670654, + "rewards/margins": 0.042485881596803665, + "rewards/rejected": -0.2244458943605423, + "step": 2450 + }, + { + "epoch": 0.4238456237077877, + "grad_norm": 3.3683722019195557, + "learning_rate": 9.94828689650246e-08, + "logits/chosen": -2.817074775695801, + "logits/rejected": -2.792792797088623, + "logps/chosen": -73.75148010253906, + "logps/rejected": -72.57868957519531, + "loss": 0.673, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.16400498151779175, + "rewards/margins": 0.045658182352781296, + "rewards/rejected": -0.20966319739818573, + "step": 2460 + }, + { + "epoch": 0.4255685733976568, + "grad_norm": 4.756073951721191, + "learning_rate": 9.946838964666062e-08, + "logits/chosen": -2.8950090408325195, + "logits/rejected": -2.8776695728302, + "logps/chosen": -72.48672485351562, + "logps/rejected": -73.16866302490234, + "loss": 0.6726, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.16018885374069214, + "rewards/margins": 0.04700163006782532, + "rewards/rejected": -0.20719048380851746, + "step": 2470 + }, + { + "epoch": 0.4272915230875258, + "grad_norm": 4.101457118988037, + "learning_rate": 9.945371149487787e-08, + "logits/chosen": -2.8252413272857666, + "logits/rejected": -2.802150011062622, + "logps/chosen": -73.47563171386719, + "logps/rejected": -71.92816925048828, + "loss": 0.677, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.18649457395076752, + "rewards/margins": 0.037734054028987885, + "rewards/rejected": -0.2242286503314972, + "step": 2480 + }, + { + "epoch": 0.4290144727773949, + "grad_norm": 4.334775447845459, + "learning_rate": 9.943883456867374e-08, + "logits/chosen": -2.8387622833251953, + "logits/rejected": -2.8354439735412598, + "logps/chosen": -68.5081558227539, + "logps/rejected": -74.07096862792969, + "loss": 0.6722, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.18232309818267822, + "rewards/margins": 0.04770932346582413, + "rewards/rejected": -0.23003241419792175, + "step": 2490 + }, + { + "epoch": 0.43073742246726393, + "grad_norm": 4.520421981811523, + "learning_rate": 9.942375892784464e-08, + "logits/chosen": -2.8706278800964355, + "logits/rejected": -2.852548122406006, + "logps/chosen": -77.92839050292969, + "logps/rejected": -80.6195068359375, + "loss": 0.6772, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.2113899290561676, + "rewards/margins": 0.03781526908278465, + "rewards/rejected": -0.24920520186424255, + "step": 2500 + }, + { + "epoch": 0.432460372157133, + "grad_norm": 4.175434112548828, + "learning_rate": 9.940848463298563e-08, + "logits/chosen": -2.785006046295166, + "logits/rejected": -2.7793169021606445, + "logps/chosen": -73.74462127685547, + "logps/rejected": -76.24616241455078, + "loss": 0.6711, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.20549897849559784, + "rewards/margins": 0.04951024800539017, + "rewards/rejected": -0.2550092339515686, + "step": 2510 + }, + { + "epoch": 0.4341833218470021, + "grad_norm": 4.60468864440918, + "learning_rate": 9.939301174549025e-08, + "logits/chosen": -2.775952100753784, + "logits/rejected": -2.7564125061035156, + "logps/chosen": -73.282958984375, + "logps/rejected": -74.0230712890625, + "loss": 0.6674, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.20053362846374512, + "rewards/margins": 0.057301588356494904, + "rewards/rejected": -0.25783517956733704, + "step": 2520 + }, + { + "epoch": 0.4359062715368711, + "grad_norm": 5.754627704620361, + "learning_rate": 9.93773403275503e-08, + "logits/chosen": -2.8354198932647705, + "logits/rejected": -2.8365046977996826, + "logps/chosen": -73.2640151977539, + "logps/rejected": -76.65324401855469, + "loss": 0.6841, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.21130648255348206, + "rewards/margins": 0.024294385686516762, + "rewards/rejected": -0.23560085892677307, + "step": 2530 + }, + { + "epoch": 0.4376292212267402, + "grad_norm": 7.0299577713012695, + "learning_rate": 9.936147044215552e-08, + "logits/chosen": -2.8332438468933105, + "logits/rejected": -2.821242570877075, + "logps/chosen": -76.29328918457031, + "logps/rejected": -79.34749603271484, + "loss": 0.6766, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.22417590022087097, + "rewards/margins": 0.03858811408281326, + "rewards/rejected": -0.262764036655426, + "step": 2540 + }, + { + "epoch": 0.4393521709166092, + "grad_norm": 4.671040058135986, + "learning_rate": 9.934540215309342e-08, + "logits/chosen": -2.828977584838867, + "logits/rejected": -2.8029513359069824, + "logps/chosen": -80.964111328125, + "logps/rejected": -78.33306121826172, + "loss": 0.6766, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.2089128941297531, + "rewards/margins": 0.041519373655319214, + "rewards/rejected": -0.2504322826862335, + "step": 2550 + }, + { + "epoch": 0.4410751206064783, + "grad_norm": 4.371002197265625, + "learning_rate": 9.932913552494887e-08, + "logits/chosen": -2.875154495239258, + "logits/rejected": -2.855593204498291, + "logps/chosen": -77.19731140136719, + "logps/rejected": -78.25651550292969, + "loss": 0.6788, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.210457444190979, + "rewards/margins": 0.034095216542482376, + "rewards/rejected": -0.24455265700817108, + "step": 2560 + }, + { + "epoch": 0.4427980702963473, + "grad_norm": 4.936331748962402, + "learning_rate": 9.931267062310407e-08, + "logits/chosen": -2.818891763687134, + "logits/rejected": -2.808858871459961, + "logps/chosen": -79.46900939941406, + "logps/rejected": -78.81519317626953, + "loss": 0.6793, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.19554241001605988, + "rewards/margins": 0.03420441597700119, + "rewards/rejected": -0.22974681854248047, + "step": 2570 + }, + { + "epoch": 0.4445210199862164, + "grad_norm": 4.78171968460083, + "learning_rate": 9.929600751373807e-08, + "logits/chosen": -2.839102268218994, + "logits/rejected": -2.8269808292388916, + "logps/chosen": -74.9248275756836, + "logps/rejected": -76.74894714355469, + "loss": 0.6785, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.20845679938793182, + "rewards/margins": 0.035146139562129974, + "rewards/rejected": -0.2436029464006424, + "step": 2580 + }, + { + "epoch": 0.4462439696760855, + "grad_norm": 6.440813064575195, + "learning_rate": 9.927914626382665e-08, + "logits/chosen": -2.8103909492492676, + "logits/rejected": -2.782987117767334, + "logps/chosen": -77.16362762451172, + "logps/rejected": -75.17143249511719, + "loss": 0.6709, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.2005034238100052, + "rewards/margins": 0.04929278790950775, + "rewards/rejected": -0.24979619681835175, + "step": 2590 + }, + { + "epoch": 0.4479669193659545, + "grad_norm": 5.180968761444092, + "learning_rate": 9.926208694114196e-08, + "logits/chosen": -2.833908796310425, + "logits/rejected": -2.8047003746032715, + "logps/chosen": -80.12142944335938, + "logps/rejected": -74.12831115722656, + "loss": 0.675, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.2155274599790573, + "rewards/margins": 0.043395183980464935, + "rewards/rejected": -0.25892263650894165, + "step": 2600 + }, + { + "epoch": 0.4496898690558236, + "grad_norm": 5.009751796722412, + "learning_rate": 9.924482961425232e-08, + "logits/chosen": -2.8132882118225098, + "logits/rejected": -2.7851760387420654, + "logps/chosen": -79.89849853515625, + "logps/rejected": -75.80006408691406, + "loss": 0.6793, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.21182087063789368, + "rewards/margins": 0.03493381291627884, + "rewards/rejected": -0.24675467610359192, + "step": 2610 + }, + { + "epoch": 0.4514128187456926, + "grad_norm": 4.712821960449219, + "learning_rate": 9.922737435252189e-08, + "logits/chosen": -2.8382675647735596, + "logits/rejected": -2.8158297538757324, + "logps/chosen": -71.83258056640625, + "logps/rejected": -76.35133361816406, + "loss": 0.6626, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.19858713448047638, + "rewards/margins": 0.06706685572862625, + "rewards/rejected": -0.2656540274620056, + "step": 2620 + }, + { + "epoch": 0.4531357684355617, + "grad_norm": 5.183804988861084, + "learning_rate": 9.92097212261104e-08, + "logits/chosen": -2.7748849391937256, + "logits/rejected": -2.755716562271118, + "logps/chosen": -74.21707153320312, + "logps/rejected": -78.53225708007812, + "loss": 0.6618, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.18580952286720276, + "rewards/margins": 0.07012667506933212, + "rewards/rejected": -0.2559362053871155, + "step": 2630 + }, + { + "epoch": 0.4548587181254307, + "grad_norm": 6.439208030700684, + "learning_rate": 9.919187030597288e-08, + "logits/chosen": -2.7924511432647705, + "logits/rejected": -2.78043532371521, + "logps/chosen": -71.54872131347656, + "logps/rejected": -72.90419006347656, + "loss": 0.6738, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.20230814814567566, + "rewards/margins": 0.04494600370526314, + "rewards/rejected": -0.2472541779279709, + "step": 2640 + }, + { + "epoch": 0.4565816678152998, + "grad_norm": 4.538118362426758, + "learning_rate": 9.91738216638594e-08, + "logits/chosen": -2.7581429481506348, + "logits/rejected": -2.7481188774108887, + "logps/chosen": -72.51766204833984, + "logps/rejected": -76.71464538574219, + "loss": 0.6724, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.18791751563549042, + "rewards/margins": 0.04803906008601189, + "rewards/rejected": -0.23595662415027618, + "step": 2650 + }, + { + "epoch": 0.4583046175051689, + "grad_norm": 4.052431583404541, + "learning_rate": 9.915557537231472e-08, + "logits/chosen": -2.797879934310913, + "logits/rejected": -2.769853115081787, + "logps/chosen": -77.92301177978516, + "logps/rejected": -77.94435119628906, + "loss": 0.6642, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.19476445019245148, + "rewards/margins": 0.06543554365634918, + "rewards/rejected": -0.26019999384880066, + "step": 2660 + }, + { + "epoch": 0.4600275671950379, + "grad_norm": 4.2934112548828125, + "learning_rate": 9.913713150467805e-08, + "logits/chosen": -2.773261308670044, + "logits/rejected": -2.753025770187378, + "logps/chosen": -77.44517517089844, + "logps/rejected": -78.64486694335938, + "loss": 0.673, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.22384801506996155, + "rewards/margins": 0.04718298092484474, + "rewards/rejected": -0.2710309624671936, + "step": 2670 + }, + { + "epoch": 0.461750516884907, + "grad_norm": 4.471557140350342, + "learning_rate": 9.911849013508274e-08, + "logits/chosen": -2.8380188941955566, + "logits/rejected": -2.814384698867798, + "logps/chosen": -81.60688781738281, + "logps/rejected": -78.93445587158203, + "loss": 0.6756, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.2197762429714203, + "rewards/margins": 0.04226057603955269, + "rewards/rejected": -0.26203683018684387, + "step": 2680 + }, + { + "epoch": 0.463473466574776, + "grad_norm": 5.9764814376831055, + "learning_rate": 9.9099651338456e-08, + "logits/chosen": -2.7970428466796875, + "logits/rejected": -2.783587694168091, + "logps/chosen": -73.98988342285156, + "logps/rejected": -77.5247573852539, + "loss": 0.6692, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.22796630859375, + "rewards/margins": 0.05491773411631584, + "rewards/rejected": -0.28288403153419495, + "step": 2690 + }, + { + "epoch": 0.4651964162646451, + "grad_norm": 4.950373649597168, + "learning_rate": 9.908061519051851e-08, + "logits/chosen": -2.802163600921631, + "logits/rejected": -2.778865337371826, + "logps/chosen": -74.62218475341797, + "logps/rejected": -78.43358612060547, + "loss": 0.6701, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.20865485072135925, + "rewards/margins": 0.05166729539632797, + "rewards/rejected": -0.2603221535682678, + "step": 2700 + }, + { + "epoch": 0.4669193659545141, + "grad_norm": 5.659837245941162, + "learning_rate": 9.906138176778426e-08, + "logits/chosen": -2.8237104415893555, + "logits/rejected": -2.8062551021575928, + "logps/chosen": -81.23680877685547, + "logps/rejected": -77.97103881835938, + "loss": 0.6818, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.20353296399116516, + "rewards/margins": 0.02904905006289482, + "rewards/rejected": -0.2325820028781891, + "step": 2710 + }, + { + "epoch": 0.4686423156443832, + "grad_norm": 5.343284606933594, + "learning_rate": 9.904195114756013e-08, + "logits/chosen": -2.7975616455078125, + "logits/rejected": -2.804506301879883, + "logps/chosen": -75.00023651123047, + "logps/rejected": -79.24310302734375, + "loss": 0.6755, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.21659204363822937, + "rewards/margins": 0.04194109886884689, + "rewards/rejected": -0.25853317975997925, + "step": 2720 + }, + { + "epoch": 0.4703652653342522, + "grad_norm": 6.924177646636963, + "learning_rate": 9.90223234079456e-08, + "logits/chosen": -2.8070428371429443, + "logits/rejected": -2.7972311973571777, + "logps/chosen": -81.18864440917969, + "logps/rejected": -79.83781433105469, + "loss": 0.6759, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.2139493227005005, + "rewards/margins": 0.03996586799621582, + "rewards/rejected": -0.2539151906967163, + "step": 2730 + }, + { + "epoch": 0.4720882150241213, + "grad_norm": 5.208271503448486, + "learning_rate": 9.900249862783253e-08, + "logits/chosen": -2.7930102348327637, + "logits/rejected": -2.7804884910583496, + "logps/chosen": -75.43363952636719, + "logps/rejected": -72.59468078613281, + "loss": 0.6826, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.2124830037355423, + "rewards/margins": 0.02715051732957363, + "rewards/rejected": -0.23963353037834167, + "step": 2740 + }, + { + "epoch": 0.4738111647139904, + "grad_norm": 6.84669303894043, + "learning_rate": 9.898247688690467e-08, + "logits/chosen": -2.738506555557251, + "logits/rejected": -2.7383079528808594, + "logps/chosen": -69.57581329345703, + "logps/rejected": -77.10731506347656, + "loss": 0.6696, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.19511033594608307, + "rewards/margins": 0.053858477622270584, + "rewards/rejected": -0.24896883964538574, + "step": 2750 + }, + { + "epoch": 0.4755341144038594, + "grad_norm": 5.511141300201416, + "learning_rate": 9.896225826563748e-08, + "logits/chosen": -2.7798218727111816, + "logits/rejected": -2.772571325302124, + "logps/chosen": -76.56613159179688, + "logps/rejected": -80.88214111328125, + "loss": 0.6691, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.21049802005290985, + "rewards/margins": 0.05490432307124138, + "rewards/rejected": -0.26540234684944153, + "step": 2760 + }, + { + "epoch": 0.4772570640937285, + "grad_norm": 5.58799934387207, + "learning_rate": 9.894184284529776e-08, + "logits/chosen": -2.8549933433532715, + "logits/rejected": -2.82932710647583, + "logps/chosen": -76.98066711425781, + "logps/rejected": -76.55406188964844, + "loss": 0.6775, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2074943482875824, + "rewards/margins": 0.03717175871133804, + "rewards/rejected": -0.24466614425182343, + "step": 2770 + }, + { + "epoch": 0.4789800137835975, + "grad_norm": 5.142263412475586, + "learning_rate": 9.892123070794331e-08, + "logits/chosen": -2.738034725189209, + "logits/rejected": -2.717013120651245, + "logps/chosen": -76.2380599975586, + "logps/rejected": -77.95225524902344, + "loss": 0.6714, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.23084776103496552, + "rewards/margins": 0.05035693198442459, + "rewards/rejected": -0.2812047302722931, + "step": 2780 + }, + { + "epoch": 0.4807029634734666, + "grad_norm": 5.2472734451293945, + "learning_rate": 9.890042193642267e-08, + "logits/chosen": -2.8136708736419678, + "logits/rejected": -2.789565324783325, + "logps/chosen": -74.58724212646484, + "logps/rejected": -76.51789855957031, + "loss": 0.668, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.21276339888572693, + "rewards/margins": 0.056669749319553375, + "rewards/rejected": -0.2694331407546997, + "step": 2790 + }, + { + "epoch": 0.4824259131633356, + "grad_norm": 6.30368185043335, + "learning_rate": 9.887941661437464e-08, + "logits/chosen": -2.8293824195861816, + "logits/rejected": -2.807325839996338, + "logps/chosen": -84.5650634765625, + "logps/rejected": -84.91242980957031, + "loss": 0.6726, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.24537885189056396, + "rewards/margins": 0.047574784606695175, + "rewards/rejected": -0.2929536700248718, + "step": 2800 + }, + { + "epoch": 0.4824259131633356, + "eval_logits/chosen": -2.8651082515716553, + "eval_logits/rejected": -2.861708641052246, + "eval_logps/chosen": -75.15972137451172, + "eval_logps/rejected": -82.17526245117188, + "eval_loss": 0.6792241334915161, + "eval_rewards/accuracies": 0.5766728520393372, + "eval_rewards/chosen": -0.16144251823425293, + "eval_rewards/margins": 0.032814137637615204, + "eval_rewards/rejected": -0.19425663352012634, + "eval_runtime": 383.2273, + "eval_samples_per_second": 11.231, + "eval_steps_per_second": 1.404, + "step": 2800 + }, + { + "epoch": 0.4841488628532047, + "grad_norm": 5.956264019012451, + "learning_rate": 9.885821482622812e-08, + "logits/chosen": -2.755012035369873, + "logits/rejected": -2.734792709350586, + "logps/chosen": -79.61177825927734, + "logps/rejected": -83.44078826904297, + "loss": 0.672, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.22803381085395813, + "rewards/margins": 0.051030099391937256, + "rewards/rejected": -0.2790639102458954, + "step": 2810 + }, + { + "epoch": 0.48587181254307377, + "grad_norm": 5.045444965362549, + "learning_rate": 9.883681665720162e-08, + "logits/chosen": -2.832822799682617, + "logits/rejected": -2.8235433101654053, + "logps/chosen": -79.9222640991211, + "logps/rejected": -79.30545806884766, + "loss": 0.6795, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.24114854633808136, + "rewards/margins": 0.03448627144098282, + "rewards/rejected": -0.2756348252296448, + "step": 2820 + }, + { + "epoch": 0.4875947622329428, + "grad_norm": 5.051169395446777, + "learning_rate": 9.881522219330303e-08, + "logits/chosen": -2.709955930709839, + "logits/rejected": -2.697514057159424, + "logps/chosen": -79.45323944091797, + "logps/rejected": -83.29679107666016, + "loss": 0.6666, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.22701826691627502, + "rewards/margins": 0.060922276228666306, + "rewards/rejected": -0.2879405617713928, + "step": 2830 + }, + { + "epoch": 0.48931771192281187, + "grad_norm": 5.050863742828369, + "learning_rate": 9.879343152132922e-08, + "logits/chosen": -2.8177552223205566, + "logits/rejected": -2.808845043182373, + "logps/chosen": -78.31355285644531, + "logps/rejected": -79.11454010009766, + "loss": 0.6735, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.24140088260173798, + "rewards/margins": 0.045556072145700455, + "rewards/rejected": -0.28695693612098694, + "step": 2840 + }, + { + "epoch": 0.4910406616126809, + "grad_norm": 4.998874187469482, + "learning_rate": 9.87714447288657e-08, + "logits/chosen": -2.7891416549682617, + "logits/rejected": -2.775479555130005, + "logps/chosen": -77.19549560546875, + "logps/rejected": -84.73179626464844, + "loss": 0.6641, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.2396540641784668, + "rewards/margins": 0.06720131635665894, + "rewards/rejected": -0.30685538053512573, + "step": 2850 + }, + { + "epoch": 0.49276361130255, + "grad_norm": 4.9351301193237305, + "learning_rate": 9.874926190428623e-08, + "logits/chosen": -2.779432535171509, + "logits/rejected": -2.76192569732666, + "logps/chosen": -78.51346588134766, + "logps/rejected": -80.85316467285156, + "loss": 0.6658, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.23384924232959747, + "rewards/margins": 0.062432099133729935, + "rewards/rejected": -0.2962813079357147, + "step": 2860 + }, + { + "epoch": 0.494486560992419, + "grad_norm": 6.352870941162109, + "learning_rate": 9.872688313675258e-08, + "logits/chosen": -2.8443655967712402, + "logits/rejected": -2.830497980117798, + "logps/chosen": -81.04945373535156, + "logps/rejected": -81.22298431396484, + "loss": 0.6719, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.24311265349388123, + "rewards/margins": 0.049511753022670746, + "rewards/rejected": -0.29262441396713257, + "step": 2870 + }, + { + "epoch": 0.4962095106822881, + "grad_norm": 6.229991912841797, + "learning_rate": 9.870430851621399e-08, + "logits/chosen": -2.859992504119873, + "logits/rejected": -2.8380045890808105, + "logps/chosen": -80.40449523925781, + "logps/rejected": -80.57371520996094, + "loss": 0.6644, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.24097883701324463, + "rewards/margins": 0.06652016192674637, + "rewards/rejected": -0.3074989914894104, + "step": 2880 + }, + { + "epoch": 0.49793246037215716, + "grad_norm": 5.559286594390869, + "learning_rate": 9.8681538133407e-08, + "logits/chosen": -2.8639652729034424, + "logits/rejected": -2.857152223587036, + "logps/chosen": -78.74775695800781, + "logps/rejected": -82.1052017211914, + "loss": 0.6705, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.24852073192596436, + "rewards/margins": 0.05366664007306099, + "rewards/rejected": -0.30218738317489624, + "step": 2890 + }, + { + "epoch": 0.4996554100620262, + "grad_norm": 6.255499362945557, + "learning_rate": 9.865857207985499e-08, + "logits/chosen": -2.813190221786499, + "logits/rejected": -2.807352066040039, + "logps/chosen": -76.86582946777344, + "logps/rejected": -80.31356811523438, + "loss": 0.6684, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.23233918845653534, + "rewards/margins": 0.05936668440699577, + "rewards/rejected": -0.291705846786499, + "step": 2900 + }, + { + "epoch": 0.5013783597518953, + "grad_norm": 6.7790398597717285, + "learning_rate": 9.863541044786776e-08, + "logits/chosen": -2.837186098098755, + "logits/rejected": -2.8275084495544434, + "logps/chosen": -82.68029022216797, + "logps/rejected": -86.81315612792969, + "loss": 0.6661, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.2539251446723938, + "rewards/margins": 0.0632144957780838, + "rewards/rejected": -0.3171396255493164, + "step": 2910 + }, + { + "epoch": 0.5031013094417643, + "grad_norm": 5.92188835144043, + "learning_rate": 9.861205333054126e-08, + "logits/chosen": -2.7996227741241455, + "logits/rejected": -2.794041872024536, + "logps/chosen": -80.66837310791016, + "logps/rejected": -87.4694595336914, + "loss": 0.6617, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.26344019174575806, + "rewards/margins": 0.07321880757808685, + "rewards/rejected": -0.3366590142250061, + "step": 2920 + }, + { + "epoch": 0.5048242591316333, + "grad_norm": 4.815412998199463, + "learning_rate": 9.858850082175718e-08, + "logits/chosen": -2.7737913131713867, + "logits/rejected": -2.7536840438842773, + "logps/chosen": -81.28785705566406, + "logps/rejected": -83.8777084350586, + "loss": 0.6648, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.27177712321281433, + "rewards/margins": 0.06520970165729523, + "rewards/rejected": -0.33698686957359314, + "step": 2930 + }, + { + "epoch": 0.5065472088215024, + "grad_norm": 5.337031364440918, + "learning_rate": 9.856475301618254e-08, + "logits/chosen": -2.8222336769104004, + "logits/rejected": -2.7931721210479736, + "logps/chosen": -79.30021667480469, + "logps/rejected": -80.7382583618164, + "loss": 0.6781, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.26826828718185425, + "rewards/margins": 0.03749905899167061, + "rewards/rejected": -0.30576735734939575, + "step": 2940 + }, + { + "epoch": 0.5082701585113715, + "grad_norm": 6.90012264251709, + "learning_rate": 9.854081000926937e-08, + "logits/chosen": -2.8115012645721436, + "logits/rejected": -2.7984066009521484, + "logps/chosen": -81.31830596923828, + "logps/rejected": -86.6155014038086, + "loss": 0.6643, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.2607055604457855, + "rewards/margins": 0.06801251322031021, + "rewards/rejected": -0.3287180960178375, + "step": 2950 + }, + { + "epoch": 0.5099931082012406, + "grad_norm": 7.280730724334717, + "learning_rate": 9.851667189725428e-08, + "logits/chosen": -2.7949349880218506, + "logits/rejected": -2.7746729850769043, + "logps/chosen": -81.17364501953125, + "logps/rejected": -83.45097351074219, + "loss": 0.6695, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.2636514902114868, + "rewards/margins": 0.05686463788151741, + "rewards/rejected": -0.3205161392688751, + "step": 2960 + }, + { + "epoch": 0.5117160578911096, + "grad_norm": 5.992996692657471, + "learning_rate": 9.849233877715805e-08, + "logits/chosen": -2.7750847339630127, + "logits/rejected": -2.7555854320526123, + "logps/chosen": -83.04480743408203, + "logps/rejected": -84.0536880493164, + "loss": 0.671, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.27440935373306274, + "rewards/margins": 0.052616190165281296, + "rewards/rejected": -0.32702553272247314, + "step": 2970 + }, + { + "epoch": 0.5134390075809786, + "grad_norm": 8.407181739807129, + "learning_rate": 9.846781074678536e-08, + "logits/chosen": -2.733599901199341, + "logits/rejected": -2.71382474899292, + "logps/chosen": -80.76288604736328, + "logps/rejected": -83.85502624511719, + "loss": 0.6649, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.25527653098106384, + "rewards/margins": 0.06554384529590607, + "rewards/rejected": -0.3208203911781311, + "step": 2980 + }, + { + "epoch": 0.5151619572708477, + "grad_norm": 6.10256290435791, + "learning_rate": 9.844308790472422e-08, + "logits/chosen": -2.778179168701172, + "logits/rejected": -2.7642033100128174, + "logps/chosen": -85.82478332519531, + "logps/rejected": -85.78666687011719, + "loss": 0.6773, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.28172558546066284, + "rewards/margins": 0.04021336883306503, + "rewards/rejected": -0.32193902134895325, + "step": 2990 + }, + { + "epoch": 0.5168849069607168, + "grad_norm": 5.219784259796143, + "learning_rate": 9.841817035034571e-08, + "logits/chosen": -2.788325786590576, + "logits/rejected": -2.782817840576172, + "logps/chosen": -78.93931579589844, + "logps/rejected": -84.350341796875, + "loss": 0.6753, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.26191726326942444, + "rewards/margins": 0.044065751135349274, + "rewards/rejected": -0.3059830069541931, + "step": 3000 + }, + { + "epoch": 0.5186078566505858, + "grad_norm": 6.176671981811523, + "learning_rate": 9.839305818380355e-08, + "logits/chosen": -2.804821491241455, + "logits/rejected": -2.786984920501709, + "logps/chosen": -83.13731384277344, + "logps/rejected": -83.47764587402344, + "loss": 0.6774, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.2761631906032562, + "rewards/margins": 0.03957538679242134, + "rewards/rejected": -0.3157385587692261, + "step": 3010 + }, + { + "epoch": 0.5203308063404548, + "grad_norm": 7.169635772705078, + "learning_rate": 9.836775150603366e-08, + "logits/chosen": -2.8510546684265137, + "logits/rejected": -2.830547571182251, + "logps/chosen": -83.43858337402344, + "logps/rejected": -82.36913299560547, + "loss": 0.6723, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.2823221683502197, + "rewards/margins": 0.05141967535018921, + "rewards/rejected": -0.33374184370040894, + "step": 3020 + }, + { + "epoch": 0.5220537560303239, + "grad_norm": 7.140235424041748, + "learning_rate": 9.834225041875381e-08, + "logits/chosen": -2.815683126449585, + "logits/rejected": -2.799219846725464, + "logps/chosen": -83.18513488769531, + "logps/rejected": -84.9632339477539, + "loss": 0.6781, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.2686838209629059, + "rewards/margins": 0.037618495523929596, + "rewards/rejected": -0.3063023090362549, + "step": 3030 + }, + { + "epoch": 0.523776705720193, + "grad_norm": 5.284769535064697, + "learning_rate": 9.831655502446314e-08, + "logits/chosen": -2.837007522583008, + "logits/rejected": -2.8341352939605713, + "logps/chosen": -77.17981719970703, + "logps/rejected": -83.68660736083984, + "loss": 0.6697, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.2541511356830597, + "rewards/margins": 0.053926460444927216, + "rewards/rejected": -0.3080775737762451, + "step": 3040 + }, + { + "epoch": 0.525499655410062, + "grad_norm": 6.481640815734863, + "learning_rate": 9.829066542644183e-08, + "logits/chosen": -2.7752394676208496, + "logits/rejected": -2.7717671394348145, + "logps/chosen": -79.08528900146484, + "logps/rejected": -84.80096435546875, + "loss": 0.6735, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.24965326488018036, + "rewards/margins": 0.046648941934108734, + "rewards/rejected": -0.2963022291660309, + "step": 3050 + }, + { + "epoch": 0.5272226050999311, + "grad_norm": 6.421154499053955, + "learning_rate": 9.826458172875056e-08, + "logits/chosen": -2.7950727939605713, + "logits/rejected": -2.7809386253356934, + "logps/chosen": -80.84529113769531, + "logps/rejected": -82.78802490234375, + "loss": 0.6762, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.2448175847530365, + "rewards/margins": 0.043594036251306534, + "rewards/rejected": -0.28841158747673035, + "step": 3060 + }, + { + "epoch": 0.5289455547898001, + "grad_norm": 5.6619720458984375, + "learning_rate": 9.823830403623031e-08, + "logits/chosen": -2.7709195613861084, + "logits/rejected": -2.752265691757202, + "logps/chosen": -84.43778991699219, + "logps/rejected": -84.05045318603516, + "loss": 0.6678, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.25266867876052856, + "rewards/margins": 0.058676183223724365, + "rewards/rejected": -0.31134486198425293, + "step": 3070 + }, + { + "epoch": 0.5306685044796692, + "grad_norm": 6.314658164978027, + "learning_rate": 9.821183245450169e-08, + "logits/chosen": -2.729401111602783, + "logits/rejected": -2.7170450687408447, + "logps/chosen": -79.04667663574219, + "logps/rejected": -85.69407653808594, + "loss": 0.6782, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.28165268898010254, + "rewards/margins": 0.03992915153503418, + "rewards/rejected": -0.3215818405151367, + "step": 3080 + }, + { + "epoch": 0.5323914541695383, + "grad_norm": 6.192001819610596, + "learning_rate": 9.818516708996468e-08, + "logits/chosen": -2.764099597930908, + "logits/rejected": -2.74798846244812, + "logps/chosen": -81.06709289550781, + "logps/rejected": -85.3973388671875, + "loss": 0.6629, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.2586060166358948, + "rewards/margins": 0.06887234002351761, + "rewards/rejected": -0.3274783492088318, + "step": 3090 + }, + { + "epoch": 0.5341144038594073, + "grad_norm": 7.931329727172852, + "learning_rate": 9.815830804979814e-08, + "logits/chosen": -2.777858257293701, + "logits/rejected": -2.7577965259552, + "logps/chosen": -81.28909301757812, + "logps/rejected": -82.33088684082031, + "loss": 0.6688, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.25381529331207275, + "rewards/margins": 0.05860390514135361, + "rewards/rejected": -0.31241923570632935, + "step": 3100 + }, + { + "epoch": 0.5358373535492763, + "grad_norm": 9.322093963623047, + "learning_rate": 9.813125544195938e-08, + "logits/chosen": -2.753824472427368, + "logits/rejected": -2.7594916820526123, + "logps/chosen": -80.28367614746094, + "logps/rejected": -87.63134765625, + "loss": 0.6745, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.2879082262516022, + "rewards/margins": 0.045723266899585724, + "rewards/rejected": -0.3336314857006073, + "step": 3110 + }, + { + "epoch": 0.5375603032391454, + "grad_norm": 6.275984764099121, + "learning_rate": 9.810400937518376e-08, + "logits/chosen": -2.7972092628479004, + "logits/rejected": -2.778357982635498, + "logps/chosen": -82.89414978027344, + "logps/rejected": -87.1389389038086, + "loss": 0.6637, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.25661715865135193, + "rewards/margins": 0.06865452229976654, + "rewards/rejected": -0.3252716660499573, + "step": 3120 + }, + { + "epoch": 0.5392832529290145, + "grad_norm": 8.541313171386719, + "learning_rate": 9.807656995898422e-08, + "logits/chosen": -2.732752561569214, + "logits/rejected": -2.7282118797302246, + "logps/chosen": -80.03063201904297, + "logps/rejected": -85.75975799560547, + "loss": 0.6714, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.2712279260158539, + "rewards/margins": 0.054630815982818604, + "rewards/rejected": -0.3258587419986725, + "step": 3130 + }, + { + "epoch": 0.5410062026188835, + "grad_norm": 10.22986888885498, + "learning_rate": 9.80489373036508e-08, + "logits/chosen": -2.796969413757324, + "logits/rejected": -2.7837982177734375, + "logps/chosen": -83.42472839355469, + "logps/rejected": -89.79510498046875, + "loss": 0.6681, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.3091219961643219, + "rewards/margins": 0.06103752925992012, + "rewards/rejected": -0.3701595067977905, + "step": 3140 + }, + { + "epoch": 0.5427291523087526, + "grad_norm": 7.790271282196045, + "learning_rate": 9.802111152025037e-08, + "logits/chosen": -2.814141035079956, + "logits/rejected": -2.7935421466827393, + "logps/chosen": -85.78771209716797, + "logps/rejected": -87.28602600097656, + "loss": 0.6746, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.30056244134902954, + "rewards/margins": 0.046315573155879974, + "rewards/rejected": -0.3468780219554901, + "step": 3150 + }, + { + "epoch": 0.5444521019986216, + "grad_norm": 6.410969257354736, + "learning_rate": 9.799309272062592e-08, + "logits/chosen": -2.761298656463623, + "logits/rejected": -2.7406160831451416, + "logps/chosen": -83.67314910888672, + "logps/rejected": -87.19869995117188, + "loss": 0.6613, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.28363630175590515, + "rewards/margins": 0.07660378515720367, + "rewards/rejected": -0.36024007201194763, + "step": 3160 + }, + { + "epoch": 0.5461750516884907, + "grad_norm": 6.787548065185547, + "learning_rate": 9.796488101739633e-08, + "logits/chosen": -2.7918598651885986, + "logits/rejected": -2.769052028656006, + "logps/chosen": -87.31083679199219, + "logps/rejected": -85.34876251220703, + "loss": 0.6624, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3019317090511322, + "rewards/margins": 0.07252788543701172, + "rewards/rejected": -0.37445956468582153, + "step": 3170 + }, + { + "epoch": 0.5478980013783598, + "grad_norm": 5.428675174713135, + "learning_rate": 9.793647652395582e-08, + "logits/chosen": -2.8168251514434814, + "logits/rejected": -2.7892673015594482, + "logps/chosen": -83.43238830566406, + "logps/rejected": -87.53303527832031, + "loss": 0.6618, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.2937379479408264, + "rewards/margins": 0.07411986589431763, + "rewards/rejected": -0.36785784363746643, + "step": 3180 + }, + { + "epoch": 0.5496209510682288, + "grad_norm": 6.892975807189941, + "learning_rate": 9.79078793544735e-08, + "logits/chosen": -2.8068017959594727, + "logits/rejected": -2.8054256439208984, + "logps/chosen": -83.6307601928711, + "logps/rejected": -94.66683197021484, + "loss": 0.6563, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.3018225133419037, + "rewards/margins": 0.08390899747610092, + "rewards/rejected": -0.3857315182685852, + "step": 3190 + }, + { + "epoch": 0.5513439007580979, + "grad_norm": 8.321106910705566, + "learning_rate": 9.787908962389295e-08, + "logits/chosen": -2.749690055847168, + "logits/rejected": -2.7342047691345215, + "logps/chosen": -87.84234619140625, + "logps/rejected": -89.13800048828125, + "loss": 0.6643, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.3047778010368347, + "rewards/margins": 0.06937982141971588, + "rewards/rejected": -0.3741576373577118, + "step": 3200 + }, + { + "epoch": 0.5513439007580979, + "eval_logits/chosen": -2.842041015625, + "eval_logits/rejected": -2.8386902809143066, + "eval_logps/chosen": -84.82251739501953, + "eval_logps/rejected": -93.49150848388672, + "eval_loss": 0.672879159450531, + "eval_rewards/accuracies": 0.5947955250740051, + "eval_rewards/chosen": -0.2580704391002655, + "eval_rewards/margins": 0.049348585307598114, + "eval_rewards/rejected": -0.3074190318584442, + "eval_runtime": 382.8841, + "eval_samples_per_second": 11.241, + "eval_steps_per_second": 1.405, + "step": 3200 + }, + { + "epoch": 0.5530668504479669, + "grad_norm": 9.703495979309082, + "learning_rate": 9.785010744793172e-08, + "logits/chosen": -2.6921780109405518, + "logits/rejected": -2.672276735305786, + "logps/chosen": -89.73343658447266, + "logps/rejected": -93.46687316894531, + "loss": 0.6672, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.3509143590927124, + "rewards/margins": 0.06420746445655823, + "rewards/rejected": -0.415121853351593, + "step": 3210 + }, + { + "epoch": 0.554789800137836, + "grad_norm": 7.2378740310668945, + "learning_rate": 9.782093294308085e-08, + "logits/chosen": -2.73669695854187, + "logits/rejected": -2.7342042922973633, + "logps/chosen": -85.49198150634766, + "logps/rejected": -90.95780181884766, + "loss": 0.6759, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.3446595072746277, + "rewards/margins": 0.04523409903049469, + "rewards/rejected": -0.38989362120628357, + "step": 3220 + }, + { + "epoch": 0.556512749827705, + "grad_norm": 8.422399520874023, + "learning_rate": 9.779156622660444e-08, + "logits/chosen": -2.7746098041534424, + "logits/rejected": -2.763091564178467, + "logps/chosen": -86.66752624511719, + "logps/rejected": -93.50808715820312, + "loss": 0.6708, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.32212287187576294, + "rewards/margins": 0.0574420690536499, + "rewards/rejected": -0.37956494092941284, + "step": 3230 + }, + { + "epoch": 0.5582356995175741, + "grad_norm": 7.248619556427002, + "learning_rate": 9.77620074165392e-08, + "logits/chosen": -2.8578269481658936, + "logits/rejected": -2.8408889770507812, + "logps/chosen": -92.19932556152344, + "logps/rejected": -91.2228012084961, + "loss": 0.6734, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.3484489321708679, + "rewards/margins": 0.05245286971330643, + "rewards/rejected": -0.40090179443359375, + "step": 3240 + }, + { + "epoch": 0.5599586492074431, + "grad_norm": 7.993571758270264, + "learning_rate": 9.77322566316939e-08, + "logits/chosen": -2.776008367538452, + "logits/rejected": -2.7632105350494385, + "logps/chosen": -85.61407470703125, + "logps/rejected": -91.9402847290039, + "loss": 0.6633, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.3181673586368561, + "rewards/margins": 0.07308965176343918, + "rewards/rejected": -0.3912569582462311, + "step": 3250 + }, + { + "epoch": 0.5616815988973122, + "grad_norm": 6.735551357269287, + "learning_rate": 9.770231399164894e-08, + "logits/chosen": -2.7842421531677246, + "logits/rejected": -2.7732491493225098, + "logps/chosen": -83.95924377441406, + "logps/rejected": -89.4464111328125, + "loss": 0.6662, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.3159296214580536, + "rewards/margins": 0.06474421918392181, + "rewards/rejected": -0.3806738257408142, + "step": 3260 + }, + { + "epoch": 0.5634045485871813, + "grad_norm": 7.405908107757568, + "learning_rate": 9.76721796167559e-08, + "logits/chosen": -2.8190155029296875, + "logits/rejected": -2.8150835037231445, + "logps/chosen": -90.64122772216797, + "logps/rejected": -97.86971282958984, + "loss": 0.6661, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.345743328332901, + "rewards/margins": 0.06943206489086151, + "rewards/rejected": -0.4151753783226013, + "step": 3270 + }, + { + "epoch": 0.5651274982770503, + "grad_norm": 9.555808067321777, + "learning_rate": 9.764185362813697e-08, + "logits/chosen": -2.8297626972198486, + "logits/rejected": -2.820317506790161, + "logps/chosen": -83.25923156738281, + "logps/rejected": -90.16539001464844, + "loss": 0.6717, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.32972145080566406, + "rewards/margins": 0.054350681602954865, + "rewards/rejected": -0.38407212495803833, + "step": 3280 + }, + { + "epoch": 0.5668504479669194, + "grad_norm": 7.138297080993652, + "learning_rate": 9.761133614768454e-08, + "logits/chosen": -2.860161542892456, + "logits/rejected": -2.836207389831543, + "logps/chosen": -84.72539520263672, + "logps/rejected": -92.99263763427734, + "loss": 0.6543, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.30811357498168945, + "rewards/margins": 0.09081108123064041, + "rewards/rejected": -0.39892467856407166, + "step": 3290 + }, + { + "epoch": 0.5685733976567884, + "grad_norm": 13.185148239135742, + "learning_rate": 9.758062729806067e-08, + "logits/chosen": -2.769252061843872, + "logits/rejected": -2.7512991428375244, + "logps/chosen": -91.40082550048828, + "logps/rejected": -95.87007141113281, + "loss": 0.6628, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3581240773200989, + "rewards/margins": 0.07508586347103119, + "rewards/rejected": -0.43320995569229126, + "step": 3300 + }, + { + "epoch": 0.5702963473466575, + "grad_norm": 7.241480827331543, + "learning_rate": 9.754972720269664e-08, + "logits/chosen": -2.722712993621826, + "logits/rejected": -2.697404384613037, + "logps/chosen": -88.21076965332031, + "logps/rejected": -92.04688262939453, + "loss": 0.6605, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.33576661348342896, + "rewards/margins": 0.081563800573349, + "rewards/rejected": -0.41733041405677795, + "step": 3310 + }, + { + "epoch": 0.5720192970365265, + "grad_norm": 8.014399528503418, + "learning_rate": 9.751863598579238e-08, + "logits/chosen": -2.750521183013916, + "logits/rejected": -2.727144956588745, + "logps/chosen": -89.49032592773438, + "logps/rejected": -90.572265625, + "loss": 0.6641, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.35034722089767456, + "rewards/margins": 0.0724496990442276, + "rewards/rejected": -0.42279696464538574, + "step": 3320 + }, + { + "epoch": 0.5737422467263956, + "grad_norm": 9.008350372314453, + "learning_rate": 9.748735377231605e-08, + "logits/chosen": -2.8291258811950684, + "logits/rejected": -2.8091487884521484, + "logps/chosen": -87.87861633300781, + "logps/rejected": -95.35037231445312, + "loss": 0.6536, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.3266400396823883, + "rewards/margins": 0.0941314548254013, + "rewards/rejected": -0.4207715094089508, + "step": 3330 + }, + { + "epoch": 0.5754651964162646, + "grad_norm": 8.974753379821777, + "learning_rate": 9.745588068800347e-08, + "logits/chosen": -2.785029172897339, + "logits/rejected": -2.7689337730407715, + "logps/chosen": -93.52928161621094, + "logps/rejected": -96.78819274902344, + "loss": 0.6607, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.3577708601951599, + "rewards/margins": 0.08071313798427582, + "rewards/rejected": -0.4384840428829193, + "step": 3340 + }, + { + "epoch": 0.5771881461061337, + "grad_norm": 9.425326347351074, + "learning_rate": 9.742421685935769e-08, + "logits/chosen": -2.7120304107666016, + "logits/rejected": -2.7023465633392334, + "logps/chosen": -92.40120697021484, + "logps/rejected": -97.11851501464844, + "loss": 0.6694, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.35480552911758423, + "rewards/margins": 0.060017216950654984, + "rewards/rejected": -0.4148227572441101, + "step": 3350 + }, + { + "epoch": 0.5789110957960028, + "grad_norm": 6.583974361419678, + "learning_rate": 9.739236241364839e-08, + "logits/chosen": -2.7767224311828613, + "logits/rejected": -2.753446340560913, + "logps/chosen": -91.22637176513672, + "logps/rejected": -94.33473205566406, + "loss": 0.6637, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.35612645745277405, + "rewards/margins": 0.07528246194124222, + "rewards/rejected": -0.4314088821411133, + "step": 3360 + }, + { + "epoch": 0.5806340454858718, + "grad_norm": 9.38561725616455, + "learning_rate": 9.736031747891145e-08, + "logits/chosen": -2.7565040588378906, + "logits/rejected": -2.7531654834747314, + "logps/chosen": -86.56519317626953, + "logps/rejected": -96.07508850097656, + "loss": 0.6564, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.34099626541137695, + "rewards/margins": 0.08686031401157379, + "rewards/rejected": -0.42785653471946716, + "step": 3370 + }, + { + "epoch": 0.5823569951757409, + "grad_norm": 13.254312515258789, + "learning_rate": 9.732808218394841e-08, + "logits/chosen": -2.808115005493164, + "logits/rejected": -2.7841153144836426, + "logps/chosen": -90.53816986083984, + "logps/rejected": -90.27338409423828, + "loss": 0.6664, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.334627240896225, + "rewards/margins": 0.0670941025018692, + "rewards/rejected": -0.40172138810157776, + "step": 3380 + }, + { + "epoch": 0.5840799448656099, + "grad_norm": 11.683048248291016, + "learning_rate": 9.729565665832591e-08, + "logits/chosen": -2.764824390411377, + "logits/rejected": -2.7460172176361084, + "logps/chosen": -87.72078704833984, + "logps/rejected": -89.53865814208984, + "loss": 0.6725, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.3330671489238739, + "rewards/margins": 0.05587635189294815, + "rewards/rejected": -0.38894352316856384, + "step": 3390 + }, + { + "epoch": 0.585802894555479, + "grad_norm": 8.081195831298828, + "learning_rate": 9.726304103237522e-08, + "logits/chosen": -2.79416823387146, + "logits/rejected": -2.765594482421875, + "logps/chosen": -85.45794677734375, + "logps/rejected": -91.94193267822266, + "loss": 0.6539, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.3184451162815094, + "rewards/margins": 0.09561134874820709, + "rewards/rejected": -0.4140564799308777, + "step": 3400 + }, + { + "epoch": 0.587525844245348, + "grad_norm": 7.517322063446045, + "learning_rate": 9.723023543719171e-08, + "logits/chosen": -2.7181074619293213, + "logits/rejected": -2.6986443996429443, + "logps/chosen": -80.88175964355469, + "logps/rejected": -83.81219482421875, + "loss": 0.6645, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.30681008100509644, + "rewards/margins": 0.06839577108621597, + "rewards/rejected": -0.3752058446407318, + "step": 3410 + }, + { + "epoch": 0.5892487939352171, + "grad_norm": 8.775835037231445, + "learning_rate": 9.719724000463429e-08, + "logits/chosen": -2.730861186981201, + "logits/rejected": -2.7171430587768555, + "logps/chosen": -83.10078430175781, + "logps/rejected": -90.54771423339844, + "loss": 0.6586, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.2924661636352539, + "rewards/margins": 0.08081065118312836, + "rewards/rejected": -0.37327679991722107, + "step": 3420 + }, + { + "epoch": 0.5909717436250862, + "grad_norm": 9.496828079223633, + "learning_rate": 9.716405486732494e-08, + "logits/chosen": -2.8000082969665527, + "logits/rejected": -2.7859044075012207, + "logps/chosen": -83.32926940917969, + "logps/rejected": -92.90452575683594, + "loss": 0.6599, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.31072303652763367, + "rewards/margins": 0.07991008460521698, + "rewards/rejected": -0.39063313603401184, + "step": 3430 + }, + { + "epoch": 0.5926946933149552, + "grad_norm": 8.886645317077637, + "learning_rate": 9.71306801586481e-08, + "logits/chosen": -2.732738494873047, + "logits/rejected": -2.719587802886963, + "logps/chosen": -89.0845718383789, + "logps/rejected": -96.92495727539062, + "loss": 0.657, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.36353951692581177, + "rewards/margins": 0.08779202401638031, + "rewards/rejected": -0.4513315260410309, + "step": 3440 + }, + { + "epoch": 0.5944176430048242, + "grad_norm": 8.436723709106445, + "learning_rate": 9.709711601275018e-08, + "logits/chosen": -2.9035556316375732, + "logits/rejected": -2.8733808994293213, + "logps/chosen": -98.23624420166016, + "logps/rejected": -97.32003021240234, + "loss": 0.6699, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.3759039342403412, + "rewards/margins": 0.061831988394260406, + "rewards/rejected": -0.4377359449863434, + "step": 3450 + }, + { + "epoch": 0.5961405926946933, + "grad_norm": 9.629737854003906, + "learning_rate": 9.706336256453906e-08, + "logits/chosen": -2.745333433151245, + "logits/rejected": -2.7406082153320312, + "logps/chosen": -87.07964324951172, + "logps/rejected": -94.35381317138672, + "loss": 0.6641, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.34154319763183594, + "rewards/margins": 0.07244926691055298, + "rewards/rejected": -0.4139924645423889, + "step": 3460 + }, + { + "epoch": 0.5978635423845624, + "grad_norm": 7.333967208862305, + "learning_rate": 9.702941994968345e-08, + "logits/chosen": -2.7896111011505127, + "logits/rejected": -2.7804951667785645, + "logps/chosen": -94.11683654785156, + "logps/rejected": -97.30807495117188, + "loss": 0.6645, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.3686944842338562, + "rewards/margins": 0.07138319313526154, + "rewards/rejected": -0.44007769227027893, + "step": 3470 + }, + { + "epoch": 0.5995864920744314, + "grad_norm": 9.837492942810059, + "learning_rate": 9.699528830461241e-08, + "logits/chosen": -2.76103138923645, + "logits/rejected": -2.7402420043945312, + "logps/chosen": -94.16011810302734, + "logps/rejected": -97.65756225585938, + "loss": 0.6618, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.3834958076477051, + "rewards/margins": 0.08097387850284576, + "rewards/rejected": -0.4644697308540344, + "step": 3480 + }, + { + "epoch": 0.6013094417643005, + "grad_norm": 12.011924743652344, + "learning_rate": 9.69609677665148e-08, + "logits/chosen": -2.759596347808838, + "logits/rejected": -2.7367639541625977, + "logps/chosen": -94.12642669677734, + "logps/rejected": -102.76054382324219, + "loss": 0.6558, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.39481350779533386, + "rewards/margins": 0.09345494955778122, + "rewards/rejected": -0.4882684648036957, + "step": 3490 + }, + { + "epoch": 0.6030323914541695, + "grad_norm": 7.919803142547607, + "learning_rate": 9.692645847333871e-08, + "logits/chosen": -2.7291781902313232, + "logits/rejected": -2.7234292030334473, + "logps/chosen": -90.75396728515625, + "logps/rejected": -97.35155487060547, + "loss": 0.6711, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.380003422498703, + "rewards/margins": 0.05795666575431824, + "rewards/rejected": -0.43796008825302124, + "step": 3500 + }, + { + "epoch": 0.6047553411440386, + "grad_norm": 6.984891891479492, + "learning_rate": 9.689176056379091e-08, + "logits/chosen": -2.6967155933380127, + "logits/rejected": -2.6760733127593994, + "logps/chosen": -93.64738464355469, + "logps/rejected": -96.0606460571289, + "loss": 0.6701, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.39596277475357056, + "rewards/margins": 0.06031836196780205, + "rewards/rejected": -0.4562811255455017, + "step": 3510 + }, + { + "epoch": 0.6064782908339077, + "grad_norm": 8.624043464660645, + "learning_rate": 9.68568741773363e-08, + "logits/chosen": -2.71110463142395, + "logits/rejected": -2.6912643909454346, + "logps/chosen": -90.24055480957031, + "logps/rejected": -95.48713684082031, + "loss": 0.6522, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.3449710011482239, + "rewards/margins": 0.09635841846466064, + "rewards/rejected": -0.44132938981056213, + "step": 3520 + }, + { + "epoch": 0.6082012405237767, + "grad_norm": 9.641185760498047, + "learning_rate": 9.682179945419735e-08, + "logits/chosen": -2.834808111190796, + "logits/rejected": -2.797631025314331, + "logps/chosen": -93.6556625366211, + "logps/rejected": -97.22142028808594, + "loss": 0.6525, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.3713035583496094, + "rewards/margins": 0.0992220938205719, + "rewards/rejected": -0.4705256521701813, + "step": 3530 + }, + { + "epoch": 0.6099241902136457, + "grad_norm": 8.934781074523926, + "learning_rate": 9.678653653535353e-08, + "logits/chosen": -2.7017455101013184, + "logits/rejected": -2.6818392276763916, + "logps/chosen": -96.44803619384766, + "logps/rejected": -99.64411926269531, + "loss": 0.6686, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.41043099761009216, + "rewards/margins": 0.06640861928462982, + "rewards/rejected": -0.4768396317958832, + "step": 3540 + }, + { + "epoch": 0.6116471399035148, + "grad_norm": 9.147663116455078, + "learning_rate": 9.675108556254073e-08, + "logits/chosen": -2.7528347969055176, + "logits/rejected": -2.7469847202301025, + "logps/chosen": -97.86283111572266, + "logps/rejected": -99.20372772216797, + "loss": 0.6765, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.42837315797805786, + "rewards/margins": 0.04629306495189667, + "rewards/rejected": -0.47466620802879333, + "step": 3550 + }, + { + "epoch": 0.6133700895933839, + "grad_norm": 8.061542510986328, + "learning_rate": 9.67154466782507e-08, + "logits/chosen": -2.7205893993377686, + "logits/rejected": -2.701850414276123, + "logps/chosen": -94.24388122558594, + "logps/rejected": -96.93658447265625, + "loss": 0.6704, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.39585188031196594, + "rewards/margins": 0.05933908745646477, + "rewards/rejected": -0.4551909565925598, + "step": 3560 + }, + { + "epoch": 0.6150930392832529, + "grad_norm": 7.042123794555664, + "learning_rate": 9.667962002573053e-08, + "logits/chosen": -2.8203914165496826, + "logits/rejected": -2.795574426651001, + "logps/chosen": -97.29734802246094, + "logps/rejected": -98.71199798583984, + "loss": 0.6682, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.39522475004196167, + "rewards/margins": 0.0649743527173996, + "rewards/rejected": -0.46019911766052246, + "step": 3570 + }, + { + "epoch": 0.616815988973122, + "grad_norm": 8.811630249023438, + "learning_rate": 9.664360574898196e-08, + "logits/chosen": -2.792978286743164, + "logits/rejected": -2.7760887145996094, + "logps/chosen": -98.10456848144531, + "logps/rejected": -102.02606964111328, + "loss": 0.668, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.3899100124835968, + "rewards/margins": 0.06529586017131805, + "rewards/rejected": -0.45520591735839844, + "step": 3580 + }, + { + "epoch": 0.618538938662991, + "grad_norm": 9.826918601989746, + "learning_rate": 9.660740399276092e-08, + "logits/chosen": -2.7625770568847656, + "logits/rejected": -2.7533459663391113, + "logps/chosen": -96.68208312988281, + "logps/rejected": -100.1490707397461, + "loss": 0.6715, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.4101640284061432, + "rewards/margins": 0.062378399074077606, + "rewards/rejected": -0.4725424349308014, + "step": 3590 + }, + { + "epoch": 0.6202618883528601, + "grad_norm": 9.431441307067871, + "learning_rate": 9.657101490257689e-08, + "logits/chosen": -2.7645809650421143, + "logits/rejected": -2.7437069416046143, + "logps/chosen": -90.44786071777344, + "logps/rejected": -93.78791809082031, + "loss": 0.6614, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.34735265374183655, + "rewards/margins": 0.07902725040912628, + "rewards/rejected": -0.42637985944747925, + "step": 3600 + }, + { + "epoch": 0.6202618883528601, + "eval_logits/chosen": -2.8143951892852783, + "eval_logits/rejected": -2.811262845993042, + "eval_logps/chosen": -84.9094467163086, + "eval_logps/rejected": -93.341552734375, + "eval_loss": 0.6740313172340393, + "eval_rewards/accuracies": 0.5903810262680054, + "eval_rewards/chosen": -0.2589397132396698, + "eval_rewards/margins": 0.04697979614138603, + "eval_rewards/rejected": -0.3059195280075073, + "eval_runtime": 383.2425, + "eval_samples_per_second": 11.23, + "eval_steps_per_second": 1.404, + "step": 3600 + }, + { + "epoch": 0.6219848380427292, + "grad_norm": 9.241872787475586, + "learning_rate": 9.653443862469226e-08, + "logits/chosen": -2.7348623275756836, + "logits/rejected": -2.7261312007904053, + "logps/chosen": -92.20159149169922, + "logps/rejected": -92.39556884765625, + "loss": 0.6797, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.3961929976940155, + "rewards/margins": 0.041474197059869766, + "rewards/rejected": -0.43766722083091736, + "step": 3610 + }, + { + "epoch": 0.6237077877325982, + "grad_norm": 9.939810752868652, + "learning_rate": 9.64976753061219e-08, + "logits/chosen": -2.6818900108337402, + "logits/rejected": -2.663252353668213, + "logps/chosen": -91.36266326904297, + "logps/rejected": -97.33293151855469, + "loss": 0.6522, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.34590521454811096, + "rewards/margins": 0.09555970132350922, + "rewards/rejected": -0.441464900970459, + "step": 3620 + }, + { + "epoch": 0.6254307374224672, + "grad_norm": 7.1370673179626465, + "learning_rate": 9.646072509463239e-08, + "logits/chosen": -2.7756075859069824, + "logits/rejected": -2.774402379989624, + "logps/chosen": -90.3294448852539, + "logps/rejected": -103.80897521972656, + "loss": 0.6467, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.37961071729660034, + "rewards/margins": 0.11030860990285873, + "rewards/rejected": -0.4899192750453949, + "step": 3630 + }, + { + "epoch": 0.6271536871123363, + "grad_norm": 7.43474817276001, + "learning_rate": 9.642358813874154e-08, + "logits/chosen": -2.7568860054016113, + "logits/rejected": -2.747313976287842, + "logps/chosen": -93.8477783203125, + "logps/rejected": -102.84173583984375, + "loss": 0.6506, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.3741520941257477, + "rewards/margins": 0.10234732925891876, + "rewards/rejected": -0.4764993190765381, + "step": 3640 + }, + { + "epoch": 0.6288766368022054, + "grad_norm": 9.955434799194336, + "learning_rate": 9.638626458771779e-08, + "logits/chosen": -2.728194236755371, + "logits/rejected": -2.7346928119659424, + "logps/chosen": -90.55718231201172, + "logps/rejected": -102.41837310791016, + "loss": 0.6538, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.38671961426734924, + "rewards/margins": 0.09734572470188141, + "rewards/rejected": -0.48406535387039185, + "step": 3650 + }, + { + "epoch": 0.6305995864920745, + "grad_norm": 8.804527282714844, + "learning_rate": 9.63487545915795e-08, + "logits/chosen": -2.750636577606201, + "logits/rejected": -2.7261979579925537, + "logps/chosen": -99.72705078125, + "logps/rejected": -106.32795715332031, + "loss": 0.6507, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.4412182867527008, + "rewards/margins": 0.10072751343250275, + "rewards/rejected": -0.5419458150863647, + "step": 3660 + }, + { + "epoch": 0.6323225361819435, + "grad_norm": 8.900711059570312, + "learning_rate": 9.631105830109454e-08, + "logits/chosen": -2.7332570552825928, + "logits/rejected": -2.715251922607422, + "logps/chosen": -100.47251892089844, + "logps/rejected": -103.1319808959961, + "loss": 0.677, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.465008407831192, + "rewards/margins": 0.049564313143491745, + "rewards/rejected": -0.5145727396011353, + "step": 3670 + }, + { + "epoch": 0.6340454858718125, + "grad_norm": 10.140378952026367, + "learning_rate": 9.627317586777947e-08, + "logits/chosen": -2.7660374641418457, + "logits/rejected": -2.736175775527954, + "logps/chosen": -101.82589721679688, + "logps/rejected": -101.44891357421875, + "loss": 0.6696, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.4551924169063568, + "rewards/margins": 0.06743863224983215, + "rewards/rejected": -0.522631049156189, + "step": 3680 + }, + { + "epoch": 0.6357684355616816, + "grad_norm": 8.872066497802734, + "learning_rate": 9.623510744389908e-08, + "logits/chosen": -2.709749698638916, + "logits/rejected": -2.7123544216156006, + "logps/chosen": -94.74894714355469, + "logps/rejected": -110.99127197265625, + "loss": 0.6495, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.41778117418289185, + "rewards/margins": 0.11049274355173111, + "rewards/rejected": -0.52827388048172, + "step": 3690 + }, + { + "epoch": 0.6374913852515507, + "grad_norm": 12.560961723327637, + "learning_rate": 9.619685318246575e-08, + "logits/chosen": -2.741328716278076, + "logits/rejected": -2.7154643535614014, + "logps/chosen": -101.0983657836914, + "logps/rejected": -108.60958099365234, + "loss": 0.662, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.4249047636985779, + "rewards/margins": 0.08166369795799255, + "rewards/rejected": -0.5065684914588928, + "step": 3700 + }, + { + "epoch": 0.6392143349414197, + "grad_norm": 9.318950653076172, + "learning_rate": 9.615841323723878e-08, + "logits/chosen": -2.7656779289245605, + "logits/rejected": -2.7480380535125732, + "logps/chosen": -96.64082336425781, + "logps/rejected": -97.5372543334961, + "loss": 0.6736, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.4158157408237457, + "rewards/margins": 0.05774500221014023, + "rewards/rejected": -0.4735606610774994, + "step": 3710 + }, + { + "epoch": 0.6409372846312887, + "grad_norm": 9.425246238708496, + "learning_rate": 9.611978776272381e-08, + "logits/chosen": -2.7584054470062256, + "logits/rejected": -2.7434277534484863, + "logps/chosen": -91.47366333007812, + "logps/rejected": -102.65911865234375, + "loss": 0.6477, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.38570934534072876, + "rewards/margins": 0.10816343873739243, + "rewards/rejected": -0.493872731924057, + "step": 3720 + }, + { + "epoch": 0.6426602343211578, + "grad_norm": 8.263059616088867, + "learning_rate": 9.608097691417222e-08, + "logits/chosen": -2.7512903213500977, + "logits/rejected": -2.7288222312927246, + "logps/chosen": -92.90215301513672, + "logps/rejected": -99.7225570678711, + "loss": 0.6431, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.3674978017807007, + "rewards/margins": 0.12553486227989197, + "rewards/rejected": -0.49303263425827026, + "step": 3730 + }, + { + "epoch": 0.6443831840110269, + "grad_norm": 9.822864532470703, + "learning_rate": 9.604198084758046e-08, + "logits/chosen": -2.736889362335205, + "logits/rejected": -2.719757556915283, + "logps/chosen": -92.25497436523438, + "logps/rejected": -103.63455963134766, + "loss": 0.654, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.3756372332572937, + "rewards/margins": 0.09760434180498123, + "rewards/rejected": -0.47324156761169434, + "step": 3740 + }, + { + "epoch": 0.646106133700896, + "grad_norm": 8.369536399841309, + "learning_rate": 9.600279971968947e-08, + "logits/chosen": -2.791196346282959, + "logits/rejected": -2.7723984718322754, + "logps/chosen": -94.96142578125, + "logps/rejected": -100.91236877441406, + "loss": 0.6624, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.40177279710769653, + "rewards/margins": 0.08017977327108383, + "rewards/rejected": -0.4819525182247162, + "step": 3750 + }, + { + "epoch": 0.647829083390765, + "grad_norm": 11.309000968933105, + "learning_rate": 9.5963433687984e-08, + "logits/chosen": -2.7534823417663574, + "logits/rejected": -2.746103525161743, + "logps/chosen": -101.5938491821289, + "logps/rejected": -104.46724700927734, + "loss": 0.6774, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.44744873046875, + "rewards/margins": 0.05144646763801575, + "rewards/rejected": -0.49889522790908813, + "step": 3760 + }, + { + "epoch": 0.649552033080634, + "grad_norm": 10.615482330322266, + "learning_rate": 9.592388291069204e-08, + "logits/chosen": -2.74672794342041, + "logits/rejected": -2.731895923614502, + "logps/chosen": -98.07127380371094, + "logps/rejected": -102.38003540039062, + "loss": 0.6761, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.4332544803619385, + "rewards/margins": 0.05527599900960922, + "rewards/rejected": -0.4885304570198059, + "step": 3770 + }, + { + "epoch": 0.6512749827705031, + "grad_norm": 14.162087440490723, + "learning_rate": 9.588414754678408e-08, + "logits/chosen": -2.7618002891540527, + "logits/rejected": -2.7311835289001465, + "logps/chosen": -96.56783294677734, + "logps/rejected": -98.6578369140625, + "loss": 0.6607, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.4174883961677551, + "rewards/margins": 0.08339640498161316, + "rewards/rejected": -0.5008847713470459, + "step": 3780 + }, + { + "epoch": 0.6529979324603722, + "grad_norm": 11.101541519165039, + "learning_rate": 9.584422775597263e-08, + "logits/chosen": -2.7361607551574707, + "logits/rejected": -2.7133641242980957, + "logps/chosen": -96.97541046142578, + "logps/rejected": -100.37510681152344, + "loss": 0.6607, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.42009204626083374, + "rewards/margins": 0.08410472422838211, + "rewards/rejected": -0.5041967630386353, + "step": 3790 + }, + { + "epoch": 0.6547208821502413, + "grad_norm": 17.42683219909668, + "learning_rate": 9.58041236987114e-08, + "logits/chosen": -2.758574962615967, + "logits/rejected": -2.738593101501465, + "logps/chosen": -100.40462493896484, + "logps/rejected": -102.95411682128906, + "loss": 0.6594, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.4191233217716217, + "rewards/margins": 0.08777258545160294, + "rewards/rejected": -0.506895899772644, + "step": 3800 + }, + { + "epoch": 0.6564438318401102, + "grad_norm": 9.499427795410156, + "learning_rate": 9.576383553619479e-08, + "logits/chosen": -2.774747610092163, + "logits/rejected": -2.7435383796691895, + "logps/chosen": -104.61952209472656, + "logps/rejected": -106.6491928100586, + "loss": 0.6534, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.45622339844703674, + "rewards/margins": 0.09755819290876389, + "rewards/rejected": -0.5537816286087036, + "step": 3810 + }, + { + "epoch": 0.6581667815299793, + "grad_norm": 10.711869239807129, + "learning_rate": 9.572336343035719e-08, + "logits/chosen": -2.7234880924224854, + "logits/rejected": -2.7070212364196777, + "logps/chosen": -98.84184265136719, + "logps/rejected": -104.13154602050781, + "loss": 0.661, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.43561094999313354, + "rewards/margins": 0.08251044899225235, + "rewards/rejected": -0.5181214213371277, + "step": 3820 + }, + { + "epoch": 0.6598897312198484, + "grad_norm": 10.506270408630371, + "learning_rate": 9.56827075438723e-08, + "logits/chosen": -2.754150867462158, + "logits/rejected": -2.7183361053466797, + "logps/chosen": -103.78630065917969, + "logps/rejected": -102.62027740478516, + "loss": 0.6638, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.4417910575866699, + "rewards/margins": 0.07783858478069305, + "rewards/rejected": -0.5196296572685242, + "step": 3830 + }, + { + "epoch": 0.6616126809097175, + "grad_norm": 8.079768180847168, + "learning_rate": 9.564186804015257e-08, + "logits/chosen": -2.7156500816345215, + "logits/rejected": -2.7090981006622314, + "logps/chosen": -97.9146728515625, + "logps/rejected": -112.15087890625, + "loss": 0.6478, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.4257539212703705, + "rewards/margins": 0.11482544243335724, + "rewards/rejected": -0.5405794382095337, + "step": 3840 + }, + { + "epoch": 0.6633356305995864, + "grad_norm": 11.12878704071045, + "learning_rate": 9.560084508334842e-08, + "logits/chosen": -2.7950711250305176, + "logits/rejected": -2.783581018447876, + "logps/chosen": -100.24549865722656, + "logps/rejected": -101.98298645019531, + "loss": 0.6671, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.4187573790550232, + "rewards/margins": 0.07190145552158356, + "rewards/rejected": -0.49065881967544556, + "step": 3850 + }, + { + "epoch": 0.6650585802894555, + "grad_norm": 11.741887092590332, + "learning_rate": 9.555963883834766e-08, + "logits/chosen": -2.8084073066711426, + "logits/rejected": -2.7839548587799072, + "logps/chosen": -100.43121337890625, + "logps/rejected": -103.94758605957031, + "loss": 0.6682, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.4456968903541565, + "rewards/margins": 0.07233995944261551, + "rewards/rejected": -0.5180368423461914, + "step": 3860 + }, + { + "epoch": 0.6667815299793246, + "grad_norm": 11.502065658569336, + "learning_rate": 9.551824947077482e-08, + "logits/chosen": -2.736194372177124, + "logits/rejected": -2.720412254333496, + "logps/chosen": -102.61311340332031, + "logps/rejected": -108.13037109375, + "loss": 0.6503, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.4265097677707672, + "rewards/margins": 0.10871385037899017, + "rewards/rejected": -0.5352236032485962, + "step": 3870 + }, + { + "epoch": 0.6685044796691937, + "grad_norm": 10.240920066833496, + "learning_rate": 9.54766771469905e-08, + "logits/chosen": -2.7539052963256836, + "logits/rejected": -2.757045269012451, + "logps/chosen": -98.1904067993164, + "logps/rejected": -105.439453125, + "loss": 0.6696, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.4628649652004242, + "rewards/margins": 0.06372271478176117, + "rewards/rejected": -0.5265876054763794, + "step": 3880 + }, + { + "epoch": 0.6702274293590628, + "grad_norm": 11.330344200134277, + "learning_rate": 9.54349220340906e-08, + "logits/chosen": -2.805917263031006, + "logits/rejected": -2.7870750427246094, + "logps/chosen": -97.0656509399414, + "logps/rejected": -103.60054016113281, + "loss": 0.6511, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.4208943247795105, + "rewards/margins": 0.10386069864034653, + "rewards/rejected": -0.5247550010681152, + "step": 3890 + }, + { + "epoch": 0.6719503790489317, + "grad_norm": 9.428828239440918, + "learning_rate": 9.539298429990581e-08, + "logits/chosen": -2.782029628753662, + "logits/rejected": -2.743447780609131, + "logps/chosen": -100.13716125488281, + "logps/rejected": -101.36534118652344, + "loss": 0.6573, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.41971486806869507, + "rewards/margins": 0.09659760445356369, + "rewards/rejected": -0.5163124799728394, + "step": 3900 + }, + { + "epoch": 0.6736733287388008, + "grad_norm": 12.973221778869629, + "learning_rate": 9.535086411300076e-08, + "logits/chosen": -2.774634838104248, + "logits/rejected": -2.7620959281921387, + "logps/chosen": -97.77294921875, + "logps/rejected": -107.8936996459961, + "loss": 0.639, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.41240349411964417, + "rewards/margins": 0.13535526394844055, + "rewards/rejected": -0.5477586984634399, + "step": 3910 + }, + { + "epoch": 0.6753962784286699, + "grad_norm": 9.753588676452637, + "learning_rate": 9.53085616426735e-08, + "logits/chosen": -2.799535036087036, + "logits/rejected": -2.766714096069336, + "logps/chosen": -98.39482116699219, + "logps/rejected": -98.08587646484375, + "loss": 0.6535, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.3804094195365906, + "rewards/margins": 0.10100064426660538, + "rewards/rejected": -0.48141002655029297, + "step": 3920 + }, + { + "epoch": 0.677119228118539, + "grad_norm": 11.287006378173828, + "learning_rate": 9.526607705895473e-08, + "logits/chosen": -2.7926058769226074, + "logits/rejected": -2.7882137298583984, + "logps/chosen": -94.21101379394531, + "logps/rejected": -100.4034652709961, + "loss": 0.6699, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4150943160057068, + "rewards/margins": 0.06874460726976395, + "rewards/rejected": -0.48383888602256775, + "step": 3930 + }, + { + "epoch": 0.6788421778084079, + "grad_norm": 9.813089370727539, + "learning_rate": 9.522341053260714e-08, + "logits/chosen": -2.6760735511779785, + "logits/rejected": -2.659942626953125, + "logps/chosen": -95.42730712890625, + "logps/rejected": -101.63002014160156, + "loss": 0.6603, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.4058200716972351, + "rewards/margins": 0.08270197361707687, + "rewards/rejected": -0.4885219931602478, + "step": 3940 + }, + { + "epoch": 0.680565127498277, + "grad_norm": 13.817987442016602, + "learning_rate": 9.51805622351247e-08, + "logits/chosen": -2.6872096061706543, + "logits/rejected": -2.6538822650909424, + "logps/chosen": -94.05184936523438, + "logps/rejected": -99.13981628417969, + "loss": 0.6467, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.3803722560405731, + "rewards/margins": 0.11201455444097519, + "rewards/rejected": -0.49238675832748413, + "step": 3950 + }, + { + "epoch": 0.6822880771881461, + "grad_norm": 12.471182823181152, + "learning_rate": 9.513753233873202e-08, + "logits/chosen": -2.7590904235839844, + "logits/rejected": -2.7596237659454346, + "logps/chosen": -95.321044921875, + "logps/rejected": -108.5772933959961, + "loss": 0.6568, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.4106566905975342, + "rewards/margins": 0.09463075548410416, + "rewards/rejected": -0.5052874684333801, + "step": 3960 + }, + { + "epoch": 0.6840110268780152, + "grad_norm": 11.664698600769043, + "learning_rate": 9.50943210163836e-08, + "logits/chosen": -2.748607635498047, + "logits/rejected": -2.7324156761169434, + "logps/chosen": -98.6547622680664, + "logps/rejected": -105.25838470458984, + "loss": 0.6601, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.4345720410346985, + "rewards/margins": 0.08905188739299774, + "rewards/rejected": -0.5236239433288574, + "step": 3970 + }, + { + "epoch": 0.6857339765678843, + "grad_norm": 9.298650741577148, + "learning_rate": 9.505092844176322e-08, + "logits/chosen": -2.687570571899414, + "logits/rejected": -2.681464195251465, + "logps/chosen": -94.56925201416016, + "logps/rejected": -105.93550109863281, + "loss": 0.6414, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.405214786529541, + "rewards/margins": 0.12826186418533325, + "rewards/rejected": -0.5334766507148743, + "step": 3980 + }, + { + "epoch": 0.6874569262577532, + "grad_norm": 8.55325984954834, + "learning_rate": 9.500735478928307e-08, + "logits/chosen": -2.7440967559814453, + "logits/rejected": -2.7317655086517334, + "logps/chosen": -92.8218994140625, + "logps/rejected": -101.8056869506836, + "loss": 0.6584, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.3859381079673767, + "rewards/margins": 0.0919785350561142, + "rewards/rejected": -0.4779166579246521, + "step": 3990 + }, + { + "epoch": 0.6891798759476223, + "grad_norm": 10.714520454406738, + "learning_rate": 9.496360023408332e-08, + "logits/chosen": -2.7911689281463623, + "logits/rejected": -2.765573263168335, + "logps/chosen": -97.75958251953125, + "logps/rejected": -100.08521270751953, + "loss": 0.6609, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.4003545641899109, + "rewards/margins": 0.08431843668222427, + "rewards/rejected": -0.4846729636192322, + "step": 4000 + }, + { + "epoch": 0.6891798759476223, + "eval_logits/chosen": -2.791173219680786, + "eval_logits/rejected": -2.787909984588623, + "eval_logps/chosen": -89.10730743408203, + "eval_logps/rejected": -98.77848815917969, + "eval_loss": 0.6695796251296997, + "eval_rewards/accuracies": 0.6052509546279907, + "eval_rewards/chosen": -0.300918310880661, + "eval_rewards/margins": 0.059370510280132294, + "eval_rewards/rejected": -0.3602888882160187, + "eval_runtime": 383.5897, + "eval_samples_per_second": 11.22, + "eval_steps_per_second": 1.403, + "step": 4000 + }, + { + "epoch": 0.6909028256374914, + "grad_norm": 9.583688735961914, + "learning_rate": 9.491966495203114e-08, + "logits/chosen": -2.717632293701172, + "logits/rejected": -2.7124199867248535, + "logps/chosen": -89.47596740722656, + "logps/rejected": -104.87911224365234, + "loss": 0.6421, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.4020870327949524, + "rewards/margins": 0.12617447972297668, + "rewards/rejected": -0.5282614231109619, + "step": 4010 + }, + { + "epoch": 0.6926257753273605, + "grad_norm": 8.319774627685547, + "learning_rate": 9.487554911972019e-08, + "logits/chosen": -2.72869610786438, + "logits/rejected": -2.7254672050476074, + "logps/chosen": -92.48283386230469, + "logps/rejected": -102.42913818359375, + "loss": 0.6523, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3974384069442749, + "rewards/margins": 0.10052331537008286, + "rewards/rejected": -0.49796175956726074, + "step": 4020 + }, + { + "epoch": 0.6943487250172296, + "grad_norm": 8.953021049499512, + "learning_rate": 9.483125291446976e-08, + "logits/chosen": -2.7326226234436035, + "logits/rejected": -2.7165818214416504, + "logps/chosen": -95.19175720214844, + "logps/rejected": -102.15061950683594, + "loss": 0.6586, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.38019615411758423, + "rewards/margins": 0.08821289241313934, + "rewards/rejected": -0.46840906143188477, + "step": 4030 + }, + { + "epoch": 0.6960716747070985, + "grad_norm": 12.53400707244873, + "learning_rate": 9.478677651432421e-08, + "logits/chosen": -2.779245376586914, + "logits/rejected": -2.7705986499786377, + "logps/chosen": -98.64287567138672, + "logps/rejected": -106.11576080322266, + "loss": 0.6537, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.41790658235549927, + "rewards/margins": 0.10335121303796768, + "rewards/rejected": -0.521257758140564, + "step": 4040 + }, + { + "epoch": 0.6977946243969676, + "grad_norm": 9.494404792785645, + "learning_rate": 9.47421200980521e-08, + "logits/chosen": -2.7377359867095947, + "logits/rejected": -2.7219014167785645, + "logps/chosen": -94.80987548828125, + "logps/rejected": -105.46993255615234, + "loss": 0.6493, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.4234391748905182, + "rewards/margins": 0.11501254886388779, + "rewards/rejected": -0.5384517312049866, + "step": 4050 + }, + { + "epoch": 0.6995175740868367, + "grad_norm": 9.975348472595215, + "learning_rate": 9.469728384514561e-08, + "logits/chosen": -2.7234580516815186, + "logits/rejected": -2.701939105987549, + "logps/chosen": -102.89959716796875, + "logps/rejected": -105.6712646484375, + "loss": 0.6592, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.456900417804718, + "rewards/margins": 0.08754035085439682, + "rewards/rejected": -0.544440746307373, + "step": 4060 + }, + { + "epoch": 0.7012405237767058, + "grad_norm": 10.78412914276123, + "learning_rate": 9.465226793581974e-08, + "logits/chosen": -2.696413993835449, + "logits/rejected": -2.680673599243164, + "logps/chosen": -98.06929016113281, + "logps/rejected": -109.2234878540039, + "loss": 0.6401, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.44293469190597534, + "rewards/margins": 0.13048727810382843, + "rewards/rejected": -0.5734219551086426, + "step": 4070 + }, + { + "epoch": 0.7029634734665747, + "grad_norm": 13.74826717376709, + "learning_rate": 9.460707255101159e-08, + "logits/chosen": -2.7049059867858887, + "logits/rejected": -2.696500539779663, + "logps/chosen": -99.33048248291016, + "logps/rejected": -108.95695495605469, + "loss": 0.6559, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.4602390229701996, + "rewards/margins": 0.09505314379930496, + "rewards/rejected": -0.5552922487258911, + "step": 4080 + }, + { + "epoch": 0.7046864231564438, + "grad_norm": 11.00015640258789, + "learning_rate": 9.456169787237962e-08, + "logits/chosen": -2.7260005474090576, + "logits/rejected": -2.7050795555114746, + "logps/chosen": -103.5158462524414, + "logps/rejected": -112.79571533203125, + "loss": 0.6457, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.46710270643234253, + "rewards/margins": 0.1251661330461502, + "rewards/rejected": -0.5922688245773315, + "step": 4090 + }, + { + "epoch": 0.7064093728463129, + "grad_norm": 14.23911190032959, + "learning_rate": 9.451614408230299e-08, + "logits/chosen": -2.7269930839538574, + "logits/rejected": -2.7070746421813965, + "logps/chosen": -105.35322570800781, + "logps/rejected": -111.6051025390625, + "loss": 0.648, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5135026574134827, + "rewards/margins": 0.11763473600149155, + "rewards/rejected": -0.631137490272522, + "step": 4100 + }, + { + "epoch": 0.708132322536182, + "grad_norm": 12.999456405639648, + "learning_rate": 9.447041136388078e-08, + "logits/chosen": -2.686152696609497, + "logits/rejected": -2.6782896518707275, + "logps/chosen": -114.95936584472656, + "logps/rejected": -113.77604675292969, + "loss": 0.6839, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.5828672051429749, + "rewards/margins": 0.04749060794711113, + "rewards/rejected": -0.6303578019142151, + "step": 4110 + }, + { + "epoch": 0.709855272226051, + "grad_norm": 13.190668106079102, + "learning_rate": 9.442449990093124e-08, + "logits/chosen": -2.665459156036377, + "logits/rejected": -2.672152042388916, + "logps/chosen": -106.706787109375, + "logps/rejected": -120.7962875366211, + "loss": 0.6483, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5548613667488098, + "rewards/margins": 0.11795143038034439, + "rewards/rejected": -0.672812819480896, + "step": 4120 + }, + { + "epoch": 0.71157822191592, + "grad_norm": 15.538352012634277, + "learning_rate": 9.437840987799104e-08, + "logits/chosen": -2.7314820289611816, + "logits/rejected": -2.7162790298461914, + "logps/chosen": -109.08930969238281, + "logps/rejected": -115.02335357666016, + "loss": 0.6564, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.5342675447463989, + "rewards/margins": 0.1000477522611618, + "rewards/rejected": -0.6343153715133667, + "step": 4130 + }, + { + "epoch": 0.7133011716057891, + "grad_norm": 13.29371452331543, + "learning_rate": 9.433214148031458e-08, + "logits/chosen": -2.738450050354004, + "logits/rejected": -2.72159481048584, + "logps/chosen": -113.81733703613281, + "logps/rejected": -110.34965515136719, + "loss": 0.6921, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.566449761390686, + "rewards/margins": 0.03347700834274292, + "rewards/rejected": -0.599926769733429, + "step": 4140 + }, + { + "epoch": 0.7150241212956582, + "grad_norm": 12.126227378845215, + "learning_rate": 9.428569489387324e-08, + "logits/chosen": -2.7552881240844727, + "logits/rejected": -2.739154815673828, + "logps/chosen": -108.66898345947266, + "logps/rejected": -109.2908706665039, + "loss": 0.6604, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.49945053458213806, + "rewards/margins": 0.08895335346460342, + "rewards/rejected": -0.5884039402008057, + "step": 4150 + }, + { + "epoch": 0.7167470709855273, + "grad_norm": 11.490330696105957, + "learning_rate": 9.423907030535459e-08, + "logits/chosen": -2.6959240436553955, + "logits/rejected": -2.672302722930908, + "logps/chosen": -100.10540771484375, + "logps/rejected": -106.18426513671875, + "loss": 0.6493, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.46855083107948303, + "rewards/margins": 0.111080601811409, + "rewards/rejected": -0.579631507396698, + "step": 4160 + }, + { + "epoch": 0.7184700206753962, + "grad_norm": 10.55395793914795, + "learning_rate": 9.419226790216164e-08, + "logits/chosen": -2.7417430877685547, + "logits/rejected": -2.735314130783081, + "logps/chosen": -97.56913757324219, + "logps/rejected": -108.05329895019531, + "loss": 0.6539, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.45069703459739685, + "rewards/margins": 0.10149586200714111, + "rewards/rejected": -0.5521928668022156, + "step": 4170 + }, + { + "epoch": 0.7201929703652653, + "grad_norm": 16.450298309326172, + "learning_rate": 9.414528787241215e-08, + "logits/chosen": -2.7118000984191895, + "logits/rejected": -2.6969642639160156, + "logps/chosen": -102.15837097167969, + "logps/rejected": -113.5875015258789, + "loss": 0.6456, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.45310235023498535, + "rewards/margins": 0.12073644250631332, + "rewards/rejected": -0.5738388299942017, + "step": 4180 + }, + { + "epoch": 0.7219159200551344, + "grad_norm": 12.765628814697266, + "learning_rate": 9.409813040493783e-08, + "logits/chosen": -2.709646463394165, + "logits/rejected": -2.699134111404419, + "logps/chosen": -102.05935668945312, + "logps/rejected": -113.42268371582031, + "loss": 0.6498, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5007042288780212, + "rewards/margins": 0.11319047212600708, + "rewards/rejected": -0.6138947010040283, + "step": 4190 + }, + { + "epoch": 0.7236388697450035, + "grad_norm": 10.587715148925781, + "learning_rate": 9.405079568928355e-08, + "logits/chosen": -2.738844394683838, + "logits/rejected": -2.7267041206359863, + "logps/chosen": -105.80863952636719, + "logps/rejected": -105.4742431640625, + "loss": 0.6766, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.48726946115493774, + "rewards/margins": 0.053386181592941284, + "rewards/rejected": -0.5406556129455566, + "step": 4200 + }, + { + "epoch": 0.7253618194348725, + "grad_norm": 14.55517292022705, + "learning_rate": 9.400328391570665e-08, + "logits/chosen": -2.7188029289245605, + "logits/rejected": -2.7034573554992676, + "logps/chosen": -106.10347747802734, + "logps/rejected": -108.0635986328125, + "loss": 0.6752, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.5038017630577087, + "rewards/margins": 0.060950059443712234, + "rewards/rejected": -0.5647518038749695, + "step": 4210 + }, + { + "epoch": 0.7270847691247415, + "grad_norm": 8.819595336914062, + "learning_rate": 9.395559527517611e-08, + "logits/chosen": -2.6296181678771973, + "logits/rejected": -2.6180548667907715, + "logps/chosen": -98.534423828125, + "logps/rejected": -107.92878723144531, + "loss": 0.6527, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.4547065794467926, + "rewards/margins": 0.10374270379543304, + "rewards/rejected": -0.5584492683410645, + "step": 4220 + }, + { + "epoch": 0.7288077188146106, + "grad_norm": 12.428403854370117, + "learning_rate": 9.390772995937181e-08, + "logits/chosen": -2.767120361328125, + "logits/rejected": -2.750506639480591, + "logps/chosen": -105.1033935546875, + "logps/rejected": -111.66764831542969, + "loss": 0.653, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.4818612039089203, + "rewards/margins": 0.10422980785369873, + "rewards/rejected": -0.5860909819602966, + "step": 4230 + }, + { + "epoch": 0.7305306685044797, + "grad_norm": 11.653403282165527, + "learning_rate": 9.385968816068377e-08, + "logits/chosen": -2.6813740730285645, + "logits/rejected": -2.6649351119995117, + "logps/chosen": -102.3239517211914, + "logps/rejected": -112.22544860839844, + "loss": 0.6529, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.4676525592803955, + "rewards/margins": 0.10822185128927231, + "rewards/rejected": -0.575874388217926, + "step": 4240 + }, + { + "epoch": 0.7322536181943488, + "grad_norm": 12.479633331298828, + "learning_rate": 9.381147007221137e-08, + "logits/chosen": -2.6995177268981934, + "logits/rejected": -2.6860814094543457, + "logps/chosen": -100.09844970703125, + "logps/rejected": -103.00732421875, + "loss": 0.665, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.4368916153907776, + "rewards/margins": 0.07490735501050949, + "rewards/rejected": -0.5117989778518677, + "step": 4250 + }, + { + "epoch": 0.7339765678842178, + "grad_norm": 11.699207305908203, + "learning_rate": 9.376307588776258e-08, + "logits/chosen": -2.6861469745635986, + "logits/rejected": -2.671250581741333, + "logps/chosen": -99.37552642822266, + "logps/rejected": -108.71788024902344, + "loss": 0.6508, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.4348589777946472, + "rewards/margins": 0.11435357481241226, + "rewards/rejected": -0.5492125749588013, + "step": 4260 + }, + { + "epoch": 0.7356995175740868, + "grad_norm": 13.502720832824707, + "learning_rate": 9.371450580185314e-08, + "logits/chosen": -2.6920905113220215, + "logits/rejected": -2.6746673583984375, + "logps/chosen": -92.69775390625, + "logps/rejected": -99.46900177001953, + "loss": 0.6582, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.39805346727371216, + "rewards/margins": 0.09268694370985031, + "rewards/rejected": -0.4907403588294983, + "step": 4270 + }, + { + "epoch": 0.7374224672639559, + "grad_norm": 10.148627281188965, + "learning_rate": 9.366576000970581e-08, + "logits/chosen": -2.689373016357422, + "logits/rejected": -2.674402952194214, + "logps/chosen": -96.89897155761719, + "logps/rejected": -106.751708984375, + "loss": 0.6472, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.4402523934841156, + "rewards/margins": 0.11912872642278671, + "rewards/rejected": -0.5593811273574829, + "step": 4280 + }, + { + "epoch": 0.739145416953825, + "grad_norm": 13.121967315673828, + "learning_rate": 9.36168387072496e-08, + "logits/chosen": -2.6916027069091797, + "logits/rejected": -2.6764674186706543, + "logps/chosen": -104.63089752197266, + "logps/rejected": -107.55049896240234, + "loss": 0.666, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.4880262017250061, + "rewards/margins": 0.08436858654022217, + "rewards/rejected": -0.5723947286605835, + "step": 4290 + }, + { + "epoch": 0.740868366643694, + "grad_norm": 9.410064697265625, + "learning_rate": 9.356774209111899e-08, + "logits/chosen": -2.713801383972168, + "logits/rejected": -2.706458568572998, + "logps/chosen": -99.73240661621094, + "logps/rejected": -108.1336441040039, + "loss": 0.6515, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.45986419916152954, + "rewards/margins": 0.10472697019577026, + "rewards/rejected": -0.5645912289619446, + "step": 4300 + }, + { + "epoch": 0.742591316333563, + "grad_norm": 10.330328941345215, + "learning_rate": 9.351847035865306e-08, + "logits/chosen": -2.6547112464904785, + "logits/rejected": -2.6392548084259033, + "logps/chosen": -103.56089782714844, + "logps/rejected": -110.84979248046875, + "loss": 0.6469, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.48212283849716187, + "rewards/margins": 0.11652640253305435, + "rewards/rejected": -0.598649263381958, + "step": 4310 + }, + { + "epoch": 0.7443142660234321, + "grad_norm": 14.346282005310059, + "learning_rate": 9.346902370789482e-08, + "logits/chosen": -2.704272985458374, + "logits/rejected": -2.6856577396392822, + "logps/chosen": -111.2483901977539, + "logps/rejected": -120.86759185791016, + "loss": 0.6321, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.5146492719650269, + "rewards/margins": 0.1525907665491104, + "rewards/rejected": -0.6672400236129761, + "step": 4320 + }, + { + "epoch": 0.7460372157133012, + "grad_norm": 10.199442863464355, + "learning_rate": 9.341940233759028e-08, + "logits/chosen": -2.6659274101257324, + "logits/rejected": -2.6443660259246826, + "logps/chosen": -110.65473937988281, + "logps/rejected": -112.12313079833984, + "loss": 0.6677, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.5498034954071045, + "rewards/margins": 0.08058689534664154, + "rewards/rejected": -0.6303903460502625, + "step": 4330 + }, + { + "epoch": 0.7477601654031703, + "grad_norm": 13.762100219726562, + "learning_rate": 9.336960644718777e-08, + "logits/chosen": -2.6515679359436035, + "logits/rejected": -2.6344714164733887, + "logps/chosen": -101.73217010498047, + "logps/rejected": -115.3052978515625, + "loss": 0.6392, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.5107470750808716, + "rewards/margins": 0.14539656043052673, + "rewards/rejected": -0.6561435461044312, + "step": 4340 + }, + { + "epoch": 0.7494831150930393, + "grad_norm": 18.938304901123047, + "learning_rate": 9.331963623683704e-08, + "logits/chosen": -2.6708264350891113, + "logits/rejected": -2.66340970993042, + "logps/chosen": -101.90260314941406, + "logps/rejected": -113.85909271240234, + "loss": 0.6523, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.4957842230796814, + "rewards/margins": 0.11152273416519165, + "rewards/rejected": -0.6073070168495178, + "step": 4350 + }, + { + "epoch": 0.7512060647829083, + "grad_norm": 12.086625099182129, + "learning_rate": 9.326949190738855e-08, + "logits/chosen": -2.7044968605041504, + "logits/rejected": -2.689762592315674, + "logps/chosen": -112.76841735839844, + "logps/rejected": -117.0528335571289, + "loss": 0.6693, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.5807456970214844, + "rewards/margins": 0.08174094557762146, + "rewards/rejected": -0.6624866724014282, + "step": 4360 + }, + { + "epoch": 0.7529290144727774, + "grad_norm": 13.480952262878418, + "learning_rate": 9.32191736603926e-08, + "logits/chosen": -2.7097010612487793, + "logits/rejected": -2.695061683654785, + "logps/chosen": -107.33966064453125, + "logps/rejected": -117.2765121459961, + "loss": 0.6499, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.5145081281661987, + "rewards/margins": 0.11671149730682373, + "rewards/rejected": -0.6312196254730225, + "step": 4370 + }, + { + "epoch": 0.7546519641626465, + "grad_norm": 14.373209953308105, + "learning_rate": 9.316868169809851e-08, + "logits/chosen": -2.732989549636841, + "logits/rejected": -2.7170796394348145, + "logps/chosen": -110.73612213134766, + "logps/rejected": -111.71165466308594, + "loss": 0.6827, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.5497013330459595, + "rewards/margins": 0.050543613731861115, + "rewards/rejected": -0.6002449989318848, + "step": 4380 + }, + { + "epoch": 0.7563749138525155, + "grad_norm": 12.738282203674316, + "learning_rate": 9.311801622345386e-08, + "logits/chosen": -2.6796634197235107, + "logits/rejected": -2.6703240871429443, + "logps/chosen": -103.5061264038086, + "logps/rejected": -116.71205139160156, + "loss": 0.6384, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.49565204977989197, + "rewards/margins": 0.14099957048892975, + "rewards/rejected": -0.6366516351699829, + "step": 4390 + }, + { + "epoch": 0.7580978635423845, + "grad_norm": 15.85651969909668, + "learning_rate": 9.306717744010364e-08, + "logits/chosen": -2.699371814727783, + "logits/rejected": -2.6866402626037598, + "logps/chosen": -111.85618591308594, + "logps/rejected": -117.50447082519531, + "loss": 0.6562, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.5292888879776001, + "rewards/margins": 0.09612816572189331, + "rewards/rejected": -0.6254171133041382, + "step": 4400 + }, + { + "epoch": 0.7580978635423845, + "eval_logits/chosen": -2.7548279762268066, + "eval_logits/rejected": -2.751525640487671, + "eval_logps/chosen": -99.73304748535156, + "eval_logps/rejected": -110.64991760253906, + "eval_loss": 0.6667318344116211, + "eval_rewards/accuracies": 0.598280668258667, + "eval_rewards/chosen": -0.407175749540329, + "eval_rewards/margins": 0.07182740420103073, + "eval_rewards/rejected": -0.47900310158729553, + "eval_runtime": 382.9812, + "eval_samples_per_second": 11.238, + "eval_steps_per_second": 1.405, + "step": 4400 + }, + { + "epoch": 0.7598208132322536, + "grad_norm": 14.734321594238281, + "learning_rate": 9.301616555238942e-08, + "logits/chosen": -2.641871213912964, + "logits/rejected": -2.6299562454223633, + "logps/chosen": -112.2957534790039, + "logps/rejected": -118.21693420410156, + "loss": 0.6678, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.5567243099212646, + "rewards/margins": 0.08240054547786713, + "rewards/rejected": -0.639124870300293, + "step": 4410 + }, + { + "epoch": 0.7615437629221227, + "grad_norm": 13.55246639251709, + "learning_rate": 9.296498076534858e-08, + "logits/chosen": -2.7524256706237793, + "logits/rejected": -2.717398166656494, + "logps/chosen": -111.02726745605469, + "logps/rejected": -113.57003021240234, + "loss": 0.6631, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5495994687080383, + "rewards/margins": 0.08970017731189728, + "rewards/rejected": -0.6392996311187744, + "step": 4420 + }, + { + "epoch": 0.7632667126119917, + "grad_norm": 14.815827369689941, + "learning_rate": 9.291362328471341e-08, + "logits/chosen": -2.6723484992980957, + "logits/rejected": -2.647158622741699, + "logps/chosen": -106.09244537353516, + "logps/rejected": -112.78398132324219, + "loss": 0.6552, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.5166307091712952, + "rewards/margins": 0.10847940295934677, + "rewards/rejected": -0.6251100301742554, + "step": 4430 + }, + { + "epoch": 0.7649896623018608, + "grad_norm": 13.4649076461792, + "learning_rate": 9.286209331691037e-08, + "logits/chosen": -2.705343246459961, + "logits/rejected": -2.6825528144836426, + "logps/chosen": -113.4854736328125, + "logps/rejected": -121.1431884765625, + "loss": 0.6418, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5344313979148865, + "rewards/margins": 0.1367706060409546, + "rewards/rejected": -0.6712020039558411, + "step": 4440 + }, + { + "epoch": 0.7667126119917298, + "grad_norm": 12.157050132751465, + "learning_rate": 9.281039106905916e-08, + "logits/chosen": -2.6409859657287598, + "logits/rejected": -2.6314549446105957, + "logps/chosen": -107.29130554199219, + "logps/rejected": -113.2594985961914, + "loss": 0.6558, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.5019858479499817, + "rewards/margins": 0.10079322010278702, + "rewards/rejected": -0.6027790307998657, + "step": 4450 + }, + { + "epoch": 0.7684355616815989, + "grad_norm": 10.632394790649414, + "learning_rate": 9.275851674897203e-08, + "logits/chosen": -2.711153507232666, + "logits/rejected": -2.704730749130249, + "logps/chosen": -99.85983276367188, + "logps/rejected": -111.9957504272461, + "loss": 0.6438, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.4664650559425354, + "rewards/margins": 0.1344144493341446, + "rewards/rejected": -0.6008794903755188, + "step": 4460 + }, + { + "epoch": 0.770158511371468, + "grad_norm": 11.28720474243164, + "learning_rate": 9.270647056515275e-08, + "logits/chosen": -2.7313551902770996, + "logits/rejected": -2.709290027618408, + "logps/chosen": -104.88045501708984, + "logps/rejected": -109.1452407836914, + "loss": 0.6513, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.4840072989463806, + "rewards/margins": 0.11140353977680206, + "rewards/rejected": -0.5954108238220215, + "step": 4470 + }, + { + "epoch": 0.771881461061337, + "grad_norm": 14.331686019897461, + "learning_rate": 9.265425272679596e-08, + "logits/chosen": -2.7455172538757324, + "logits/rejected": -2.7380411624908447, + "logps/chosen": -104.9108657836914, + "logps/rejected": -113.90400695800781, + "loss": 0.659, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.5284455418586731, + "rewards/margins": 0.09997548907995224, + "rewards/rejected": -0.6284209489822388, + "step": 4480 + }, + { + "epoch": 0.7736044107512061, + "grad_norm": 12.461320877075195, + "learning_rate": 9.260186344378623e-08, + "logits/chosen": -2.671069622039795, + "logits/rejected": -2.648007869720459, + "logps/chosen": -101.6649169921875, + "logps/rejected": -108.2196044921875, + "loss": 0.6586, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.4936288893222809, + "rewards/margins": 0.09367899596691132, + "rewards/rejected": -0.5873079299926758, + "step": 4490 + }, + { + "epoch": 0.7753273604410751, + "grad_norm": 16.6155948638916, + "learning_rate": 9.254930292669723e-08, + "logits/chosen": -2.6818337440490723, + "logits/rejected": -2.6689093112945557, + "logps/chosen": -108.67326354980469, + "logps/rejected": -112.25102233886719, + "loss": 0.6617, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.5086537599563599, + "rewards/margins": 0.09293018281459808, + "rewards/rejected": -0.6015838384628296, + "step": 4500 + }, + { + "epoch": 0.7770503101309442, + "grad_norm": 10.590887069702148, + "learning_rate": 9.249657138679084e-08, + "logits/chosen": -2.7585530281066895, + "logits/rejected": -2.7360479831695557, + "logps/chosen": -104.07576751708984, + "logps/rejected": -118.41740417480469, + "loss": 0.6364, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.4872332215309143, + "rewards/margins": 0.14657996594905853, + "rewards/rejected": -0.633813202381134, + "step": 4510 + }, + { + "epoch": 0.7787732598208132, + "grad_norm": 11.587627410888672, + "learning_rate": 9.244366903601644e-08, + "logits/chosen": -2.7139852046966553, + "logits/rejected": -2.6992831230163574, + "logps/chosen": -108.73707580566406, + "logps/rejected": -113.06243896484375, + "loss": 0.6659, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.552254855632782, + "rewards/margins": 0.0821274071931839, + "rewards/rejected": -0.6343822479248047, + "step": 4520 + }, + { + "epoch": 0.7804962095106823, + "grad_norm": 13.454439163208008, + "learning_rate": 9.239059608700992e-08, + "logits/chosen": -2.697965145111084, + "logits/rejected": -2.689426898956299, + "logps/chosen": -105.71885681152344, + "logps/rejected": -111.27046203613281, + "loss": 0.671, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5099459886550903, + "rewards/margins": 0.07720411568880081, + "rewards/rejected": -0.5871500968933105, + "step": 4530 + }, + { + "epoch": 0.7822191592005513, + "grad_norm": 13.542638778686523, + "learning_rate": 9.233735275309287e-08, + "logits/chosen": -2.6490254402160645, + "logits/rejected": -2.635833501815796, + "logps/chosen": -102.23767852783203, + "logps/rejected": -109.13404846191406, + "loss": 0.6522, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.48313722014427185, + "rewards/margins": 0.1087016835808754, + "rewards/rejected": -0.5918388962745667, + "step": 4540 + }, + { + "epoch": 0.7839421088904204, + "grad_norm": 17.139244079589844, + "learning_rate": 9.228393924827173e-08, + "logits/chosen": -2.7150959968566895, + "logits/rejected": -2.7003800868988037, + "logps/chosen": -107.5324935913086, + "logps/rejected": -113.44700622558594, + "loss": 0.6557, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.49557629227638245, + "rewards/margins": 0.10561362653970718, + "rewards/rejected": -0.6011899709701538, + "step": 4550 + }, + { + "epoch": 0.7856650585802895, + "grad_norm": 13.037927627563477, + "learning_rate": 9.223035578723695e-08, + "logits/chosen": -2.651930570602417, + "logits/rejected": -2.6261329650878906, + "logps/chosen": -106.41754150390625, + "logps/rejected": -117.70652770996094, + "loss": 0.6305, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.4938697814941406, + "rewards/margins": 0.16125063598155975, + "rewards/rejected": -0.6551204323768616, + "step": 4560 + }, + { + "epoch": 0.7873880082701585, + "grad_norm": 17.923662185668945, + "learning_rate": 9.217660258536204e-08, + "logits/chosen": -2.6618800163269043, + "logits/rejected": -2.64208984375, + "logps/chosen": -107.5634994506836, + "logps/rejected": -117.63850402832031, + "loss": 0.6547, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.5490785837173462, + "rewards/margins": 0.11649944633245468, + "rewards/rejected": -0.6655780076980591, + "step": 4570 + }, + { + "epoch": 0.7891109579600276, + "grad_norm": 13.755855560302734, + "learning_rate": 9.212267985870285e-08, + "logits/chosen": -2.6529924869537354, + "logits/rejected": -2.632174015045166, + "logps/chosen": -100.74827575683594, + "logps/rejected": -108.7590103149414, + "loss": 0.6465, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.45046645402908325, + "rewards/margins": 0.12104056030511856, + "rewards/rejected": -0.5715069770812988, + "step": 4580 + }, + { + "epoch": 0.7908339076498966, + "grad_norm": 17.974578857421875, + "learning_rate": 9.206858782399655e-08, + "logits/chosen": -2.7094898223876953, + "logits/rejected": -2.6892411708831787, + "logps/chosen": -111.24505615234375, + "logps/rejected": -114.80848693847656, + "loss": 0.6729, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.5568535923957825, + "rewards/margins": 0.07443558424711227, + "rewards/rejected": -0.6312891840934753, + "step": 4590 + }, + { + "epoch": 0.7925568573397657, + "grad_norm": 11.914977073669434, + "learning_rate": 9.201432669866086e-08, + "logits/chosen": -2.6332201957702637, + "logits/rejected": -2.617691993713379, + "logps/chosen": -107.5090560913086, + "logps/rejected": -125.3719482421875, + "loss": 0.6198, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5328209400177002, + "rewards/margins": 0.18362154066562653, + "rewards/rejected": -0.7164424657821655, + "step": 4600 + }, + { + "epoch": 0.7942798070296347, + "grad_norm": 13.614599227905273, + "learning_rate": 9.195989670079314e-08, + "logits/chosen": -2.654611587524414, + "logits/rejected": -2.647090435028076, + "logps/chosen": -113.46919250488281, + "logps/rejected": -117.45033264160156, + "loss": 0.6792, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.6137629151344299, + "rewards/margins": 0.0618717297911644, + "rewards/rejected": -0.6756345629692078, + "step": 4610 + }, + { + "epoch": 0.7960027567195038, + "grad_norm": 18.413545608520508, + "learning_rate": 9.190529804916952e-08, + "logits/chosen": -2.7000679969787598, + "logits/rejected": -2.679298162460327, + "logps/chosen": -110.68448638916016, + "logps/rejected": -121.6512680053711, + "loss": 0.6427, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5395559072494507, + "rewards/margins": 0.12529203295707703, + "rewards/rejected": -0.6648479104042053, + "step": 4620 + }, + { + "epoch": 0.7977257064093728, + "grad_norm": 13.928332328796387, + "learning_rate": 9.1850530963244e-08, + "logits/chosen": -2.7029757499694824, + "logits/rejected": -2.677156686782837, + "logps/chosen": -115.1556396484375, + "logps/rejected": -125.21661376953125, + "loss": 0.6445, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.5706117153167725, + "rewards/margins": 0.1268078088760376, + "rewards/rejected": -0.6974195241928101, + "step": 4630 + }, + { + "epoch": 0.7994486560992419, + "grad_norm": 11.631983757019043, + "learning_rate": 9.179559566314761e-08, + "logits/chosen": -2.693277359008789, + "logits/rejected": -2.685077667236328, + "logps/chosen": -115.26069641113281, + "logps/rejected": -125.75660705566406, + "loss": 0.6575, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.6026903390884399, + "rewards/margins": 0.10752693563699722, + "rewards/rejected": -0.7102171182632446, + "step": 4640 + }, + { + "epoch": 0.801171605789111, + "grad_norm": 13.562352180480957, + "learning_rate": 9.174049236968749e-08, + "logits/chosen": -2.6991212368011475, + "logits/rejected": -2.676464557647705, + "logps/chosen": -111.49564361572266, + "logps/rejected": -118.50048828125, + "loss": 0.6501, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.5621426701545715, + "rewards/margins": 0.11947949975728989, + "rewards/rejected": -0.6816221475601196, + "step": 4650 + }, + { + "epoch": 0.80289455547898, + "grad_norm": 21.77356719970703, + "learning_rate": 9.168522130434598e-08, + "logits/chosen": -2.6700754165649414, + "logits/rejected": -2.6562094688415527, + "logps/chosen": -107.92274475097656, + "logps/rejected": -114.88655853271484, + "loss": 0.6583, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.5256450772285461, + "rewards/margins": 0.09301182627677917, + "rewards/rejected": -0.6186568737030029, + "step": 4660 + }, + { + "epoch": 0.8046175051688491, + "grad_norm": 13.408563613891602, + "learning_rate": 9.162978268927982e-08, + "logits/chosen": -2.6990115642547607, + "logits/rejected": -2.682617664337158, + "logps/chosen": -105.44071197509766, + "logps/rejected": -113.16312408447266, + "loss": 0.6446, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.5250687599182129, + "rewards/margins": 0.12488999217748642, + "rewards/rejected": -0.6499587297439575, + "step": 4670 + }, + { + "epoch": 0.8063404548587181, + "grad_norm": 13.76351547241211, + "learning_rate": 9.157417674731917e-08, + "logits/chosen": -2.6879332065582275, + "logits/rejected": -2.6660237312316895, + "logps/chosen": -110.22966003417969, + "logps/rejected": -118.12898254394531, + "loss": 0.6602, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.5635314583778381, + "rewards/margins": 0.10251311212778091, + "rewards/rejected": -0.6660445332527161, + "step": 4680 + }, + { + "epoch": 0.8080634045485872, + "grad_norm": 13.860352516174316, + "learning_rate": 9.151840370196677e-08, + "logits/chosen": -2.7110648155212402, + "logits/rejected": -2.69399356842041, + "logps/chosen": -114.106689453125, + "logps/rejected": -126.93440246582031, + "loss": 0.6373, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.5762235522270203, + "rewards/margins": 0.15776102244853973, + "rewards/rejected": -0.7339845895767212, + "step": 4690 + }, + { + "epoch": 0.8097863542384562, + "grad_norm": 16.746814727783203, + "learning_rate": 9.146246377739695e-08, + "logits/chosen": -2.6945366859436035, + "logits/rejected": -2.6857993602752686, + "logps/chosen": -112.3055191040039, + "logps/rejected": -128.11129760742188, + "loss": 0.6441, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.5938907861709595, + "rewards/margins": 0.14227768778800964, + "rewards/rejected": -0.7361685037612915, + "step": 4700 + }, + { + "epoch": 0.8115093039283253, + "grad_norm": 15.643275260925293, + "learning_rate": 9.140635719845486e-08, + "logits/chosen": -2.723626136779785, + "logits/rejected": -2.6965630054473877, + "logps/chosen": -115.5739974975586, + "logps/rejected": -123.12554931640625, + "loss": 0.6351, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5669673681259155, + "rewards/margins": 0.14785338938236237, + "rewards/rejected": -0.7148207426071167, + "step": 4710 + }, + { + "epoch": 0.8132322536181944, + "grad_norm": 14.575557708740234, + "learning_rate": 9.135008419065549e-08, + "logits/chosen": -2.6363587379455566, + "logits/rejected": -2.6148293018341064, + "logps/chosen": -116.8868408203125, + "logps/rejected": -129.30899047851562, + "loss": 0.624, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.5912224054336548, + "rewards/margins": 0.18092842400074005, + "rewards/rejected": -0.7721508145332336, + "step": 4720 + }, + { + "epoch": 0.8149552033080634, + "grad_norm": 16.16574478149414, + "learning_rate": 9.129364498018274e-08, + "logits/chosen": -2.6242127418518066, + "logits/rejected": -2.604121446609497, + "logps/chosen": -119.1768798828125, + "logps/rejected": -125.98077392578125, + "loss": 0.6594, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.642062783241272, + "rewards/margins": 0.11331970989704132, + "rewards/rejected": -0.7553825378417969, + "step": 4730 + }, + { + "epoch": 0.8166781529979324, + "grad_norm": 15.485376358032227, + "learning_rate": 9.12370397938886e-08, + "logits/chosen": -2.6874778270721436, + "logits/rejected": -2.6816353797912598, + "logps/chosen": -110.1086196899414, + "logps/rejected": -124.9239730834961, + "loss": 0.6253, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5432339906692505, + "rewards/margins": 0.1717141717672348, + "rewards/rejected": -0.7149480581283569, + "step": 4740 + }, + { + "epoch": 0.8184011026878015, + "grad_norm": 13.701244354248047, + "learning_rate": 9.118026885929214e-08, + "logits/chosen": -2.6680829524993896, + "logits/rejected": -2.6603760719299316, + "logps/chosen": -114.32022857666016, + "logps/rejected": -121.4356918334961, + "loss": 0.6575, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.5776886343955994, + "rewards/margins": 0.09786656498908997, + "rewards/rejected": -0.6755552887916565, + "step": 4750 + }, + { + "epoch": 0.8201240523776706, + "grad_norm": 15.894256591796875, + "learning_rate": 9.112333240457866e-08, + "logits/chosen": -2.6610710620880127, + "logits/rejected": -2.6477136611938477, + "logps/chosen": -117.54109954833984, + "logps/rejected": -126.5459213256836, + "loss": 0.6537, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.6480070352554321, + "rewards/margins": 0.11084611713886261, + "rewards/rejected": -0.7588531374931335, + "step": 4760 + }, + { + "epoch": 0.8218470020675396, + "grad_norm": 15.977734565734863, + "learning_rate": 9.106623065859873e-08, + "logits/chosen": -2.703068971633911, + "logits/rejected": -2.6900250911712646, + "logps/chosen": -128.57907104492188, + "logps/rejected": -133.2577362060547, + "loss": 0.661, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.6808390021324158, + "rewards/margins": 0.11281619220972061, + "rewards/rejected": -0.793655276298523, + "step": 4770 + }, + { + "epoch": 0.8235699517574087, + "grad_norm": 15.172703742980957, + "learning_rate": 9.100896385086731e-08, + "logits/chosen": -2.6161365509033203, + "logits/rejected": -2.6057090759277344, + "logps/chosen": -117.88389587402344, + "logps/rejected": -132.16099548339844, + "loss": 0.637, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.6521201729774475, + "rewards/margins": 0.14359167218208313, + "rewards/rejected": -0.795711874961853, + "step": 4780 + }, + { + "epoch": 0.8252929014472777, + "grad_norm": 12.677447319030762, + "learning_rate": 9.095153221156283e-08, + "logits/chosen": -2.668921709060669, + "logits/rejected": -2.6519389152526855, + "logps/chosen": -127.98905944824219, + "logps/rejected": -126.518310546875, + "loss": 0.6862, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.6938291192054749, + "rewards/margins": 0.050543297082185745, + "rewards/rejected": -0.7443724274635315, + "step": 4790 + }, + { + "epoch": 0.8270158511371468, + "grad_norm": 11.560530662536621, + "learning_rate": 9.089393597152619e-08, + "logits/chosen": -2.6407485008239746, + "logits/rejected": -2.633110761642456, + "logps/chosen": -114.29498291015625, + "logps/rejected": -122.00848388671875, + "loss": 0.6569, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.6159349679946899, + "rewards/margins": 0.10554057359695435, + "rewards/rejected": -0.7214756011962891, + "step": 4800 + }, + { + "epoch": 0.8270158511371468, + "eval_logits/chosen": -2.731581687927246, + "eval_logits/rejected": -2.7282795906066895, + "eval_logps/chosen": -108.52731323242188, + "eval_logps/rejected": -120.57421875, + "eval_loss": 0.6637259721755981, + "eval_rewards/accuracies": 0.6059479713439941, + "eval_rewards/chosen": -0.49511826038360596, + "eval_rewards/margins": 0.08312792330980301, + "eval_rewards/rejected": -0.5782462358474731, + "eval_runtime": 383.3031, + "eval_samples_per_second": 11.229, + "eval_steps_per_second": 1.404, + "step": 4800 + }, + { + "epoch": 0.8287388008270159, + "grad_norm": 12.14091682434082, + "learning_rate": 9.083617536225994e-08, + "logits/chosen": -2.6535537242889404, + "logits/rejected": -2.6268935203552246, + "logps/chosen": -118.03663635253906, + "logps/rejected": -123.8675765991211, + "loss": 0.6429, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5971187353134155, + "rewards/margins": 0.13232539594173431, + "rewards/rejected": -0.729444146156311, + "step": 4810 + }, + { + "epoch": 0.8304617505168849, + "grad_norm": 13.891697883605957, + "learning_rate": 9.077825061592729e-08, + "logits/chosen": -2.6767055988311768, + "logits/rejected": -2.670660972595215, + "logps/chosen": -114.75425720214844, + "logps/rejected": -124.70011138916016, + "loss": 0.6553, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.6179844737052917, + "rewards/margins": 0.11821053922176361, + "rewards/rejected": -0.7361949682235718, + "step": 4820 + }, + { + "epoch": 0.832184700206754, + "grad_norm": 14.244720458984375, + "learning_rate": 9.072016196535112e-08, + "logits/chosen": -2.6639866828918457, + "logits/rejected": -2.6526284217834473, + "logps/chosen": -115.40333557128906, + "logps/rejected": -121.971435546875, + "loss": 0.664, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.6033909320831299, + "rewards/margins": 0.0941086858510971, + "rewards/rejected": -0.6974996328353882, + "step": 4830 + }, + { + "epoch": 0.833907649896623, + "grad_norm": 13.105058670043945, + "learning_rate": 9.066190964401321e-08, + "logits/chosen": -2.6423277854919434, + "logits/rejected": -2.619694232940674, + "logps/chosen": -121.77622985839844, + "logps/rejected": -130.93408203125, + "loss": 0.6435, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.6425229907035828, + "rewards/margins": 0.1487221121788025, + "rewards/rejected": -0.7912451028823853, + "step": 4840 + }, + { + "epoch": 0.8356305995864921, + "grad_norm": 14.29844856262207, + "learning_rate": 9.060349388605313e-08, + "logits/chosen": -2.6907999515533447, + "logits/rejected": -2.6778297424316406, + "logps/chosen": -113.85269927978516, + "logps/rejected": -126.59355163574219, + "loss": 0.6355, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.5850516557693481, + "rewards/margins": 0.1524309366941452, + "rewards/rejected": -0.7374826073646545, + "step": 4850 + }, + { + "epoch": 0.8373535492763611, + "grad_norm": 20.311260223388672, + "learning_rate": 9.054491492626736e-08, + "logits/chosen": -2.683197021484375, + "logits/rejected": -2.656275987625122, + "logps/chosen": -124.2918930053711, + "logps/rejected": -120.60951232910156, + "loss": 0.6815, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.6485036015510559, + "rewards/margins": 0.058196693658828735, + "rewards/rejected": -0.7067002654075623, + "step": 4860 + }, + { + "epoch": 0.8390764989662302, + "grad_norm": 16.1400203704834, + "learning_rate": 9.048617300010839e-08, + "logits/chosen": -2.708627939224243, + "logits/rejected": -2.689025402069092, + "logps/chosen": -121.8497543334961, + "logps/rejected": -129.84942626953125, + "loss": 0.6372, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.6298328638076782, + "rewards/margins": 0.15910618007183075, + "rewards/rejected": -0.7889389991760254, + "step": 4870 + }, + { + "epoch": 0.8407994486560992, + "grad_norm": 16.34994125366211, + "learning_rate": 9.042726834368372e-08, + "logits/chosen": -2.634161949157715, + "logits/rejected": -2.6114230155944824, + "logps/chosen": -116.34796142578125, + "logps/rejected": -122.90814208984375, + "loss": 0.66, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.6260043978691101, + "rewards/margins": 0.10967379808425903, + "rewards/rejected": -0.7356782555580139, + "step": 4880 + }, + { + "epoch": 0.8425223983459683, + "grad_norm": 16.96595573425293, + "learning_rate": 9.036820119375494e-08, + "logits/chosen": -2.6891283988952637, + "logits/rejected": -2.6712119579315186, + "logps/chosen": -117.00113677978516, + "logps/rejected": -132.6031494140625, + "loss": 0.6201, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.6106215119361877, + "rewards/margins": 0.18790537118911743, + "rewards/rejected": -0.7985268831253052, + "step": 4890 + }, + { + "epoch": 0.8442453480358374, + "grad_norm": 17.874977111816406, + "learning_rate": 9.030897178773676e-08, + "logits/chosen": -2.642887592315674, + "logits/rejected": -2.6208043098449707, + "logps/chosen": -115.6777572631836, + "logps/rejected": -121.92088317871094, + "loss": 0.6622, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6188615560531616, + "rewards/margins": 0.10674705356359482, + "rewards/rejected": -0.7256086468696594, + "step": 4900 + }, + { + "epoch": 0.8459682977257064, + "grad_norm": 17.601911544799805, + "learning_rate": 9.024958036369604e-08, + "logits/chosen": -2.74424409866333, + "logits/rejected": -2.7178235054016113, + "logps/chosen": -116.89372253417969, + "logps/rejected": -123.71903228759766, + "loss": 0.6526, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.5964015126228333, + "rewards/margins": 0.124220110476017, + "rewards/rejected": -0.7206215262413025, + "step": 4910 + }, + { + "epoch": 0.8476912474155754, + "grad_norm": 22.39488983154297, + "learning_rate": 9.019002716035091e-08, + "logits/chosen": -2.6259522438049316, + "logits/rejected": -2.612258195877075, + "logps/chosen": -112.20829010009766, + "logps/rejected": -125.56230163574219, + "loss": 0.6375, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.5727871060371399, + "rewards/margins": 0.1611333191394806, + "rewards/rejected": -0.7339202761650085, + "step": 4920 + }, + { + "epoch": 0.8494141971054445, + "grad_norm": 17.15035057067871, + "learning_rate": 9.013031241706971e-08, + "logits/chosen": -2.741135358810425, + "logits/rejected": -2.7288200855255127, + "logps/chosen": -115.10804748535156, + "logps/rejected": -127.43843078613281, + "loss": 0.6655, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.6084795594215393, + "rewards/margins": 0.09732518345117569, + "rewards/rejected": -0.705804705619812, + "step": 4930 + }, + { + "epoch": 0.8511371467953136, + "grad_norm": 21.132137298583984, + "learning_rate": 9.007043637387009e-08, + "logits/chosen": -2.705995798110962, + "logits/rejected": -2.6840262413024902, + "logps/chosen": -112.5478286743164, + "logps/rejected": -121.37569427490234, + "loss": 0.6437, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.5596413612365723, + "rewards/margins": 0.1397586166858673, + "rewards/rejected": -0.699400007724762, + "step": 4940 + }, + { + "epoch": 0.8528600964851827, + "grad_norm": 16.019283294677734, + "learning_rate": 9.001039927141802e-08, + "logits/chosen": -2.5744900703430176, + "logits/rejected": -2.558807134628296, + "logps/chosen": -108.00617980957031, + "logps/rejected": -119.18704986572266, + "loss": 0.6459, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.5191596150398254, + "rewards/margins": 0.1343960464000702, + "rewards/rejected": -0.6535556316375732, + "step": 4950 + }, + { + "epoch": 0.8545830461750517, + "grad_norm": 20.813732147216797, + "learning_rate": 8.995020135102685e-08, + "logits/chosen": -2.6362557411193848, + "logits/rejected": -2.641608476638794, + "logps/chosen": -108.81939697265625, + "logps/rejected": -129.12559509277344, + "loss": 0.6246, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.5596833229064941, + "rewards/margins": 0.1810264140367508, + "rewards/rejected": -0.7407097816467285, + "step": 4960 + }, + { + "epoch": 0.8563059958649207, + "grad_norm": 14.945688247680664, + "learning_rate": 8.988984285465631e-08, + "logits/chosen": -2.620194911956787, + "logits/rejected": -2.6145381927490234, + "logps/chosen": -112.78028869628906, + "logps/rejected": -125.85223388671875, + "loss": 0.6408, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.600356936454773, + "rewards/margins": 0.14802278578281403, + "rewards/rejected": -0.7483798265457153, + "step": 4970 + }, + { + "epoch": 0.8580289455547898, + "grad_norm": 15.188470840454102, + "learning_rate": 8.982932402491154e-08, + "logits/chosen": -2.659909725189209, + "logits/rejected": -2.656132698059082, + "logps/chosen": -115.32022857666016, + "logps/rejected": -127.0588607788086, + "loss": 0.6508, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.6081004738807678, + "rewards/margins": 0.1243026852607727, + "rewards/rejected": -0.7324031591415405, + "step": 4980 + }, + { + "epoch": 0.8597518952446589, + "grad_norm": 20.90500831604004, + "learning_rate": 8.976864510504217e-08, + "logits/chosen": -2.6207499504089355, + "logits/rejected": -2.615877151489258, + "logps/chosen": -115.70672607421875, + "logps/rejected": -133.36627197265625, + "loss": 0.635, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.631032407283783, + "rewards/margins": 0.15860576927661896, + "rewards/rejected": -0.7896381616592407, + "step": 4990 + }, + { + "epoch": 0.8614748449345279, + "grad_norm": 18.265804290771484, + "learning_rate": 8.970780633894122e-08, + "logits/chosen": -2.6352665424346924, + "logits/rejected": -2.6178627014160156, + "logps/chosen": -118.26082611083984, + "logps/rejected": -130.56179809570312, + "loss": 0.6308, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.6121785640716553, + "rewards/margins": 0.17027755081653595, + "rewards/rejected": -0.78245609998703, + "step": 5000 + }, + { + "epoch": 0.8631977946243969, + "grad_norm": 19.394807815551758, + "learning_rate": 8.964680797114426e-08, + "logits/chosen": -2.6262760162353516, + "logits/rejected": -2.6074700355529785, + "logps/chosen": -121.80794525146484, + "logps/rejected": -132.30230712890625, + "loss": 0.6437, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.6812965869903564, + "rewards/margins": 0.1452482044696808, + "rewards/rejected": -0.8265447616577148, + "step": 5010 + }, + { + "epoch": 0.864920744314266, + "grad_norm": 12.27147388458252, + "learning_rate": 8.958565024682836e-08, + "logits/chosen": -2.6230664253234863, + "logits/rejected": -2.6053595542907715, + "logps/chosen": -116.36280822753906, + "logps/rejected": -130.62164306640625, + "loss": 0.6335, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.6199180483818054, + "rewards/margins": 0.16995711624622345, + "rewards/rejected": -0.7898751497268677, + "step": 5020 + }, + { + "epoch": 0.8666436940041351, + "grad_norm": 20.50470733642578, + "learning_rate": 8.952433341181107e-08, + "logits/chosen": -2.6090950965881348, + "logits/rejected": -2.6000587940216064, + "logps/chosen": -120.79141998291016, + "logps/rejected": -133.48959350585938, + "loss": 0.6455, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.6697946786880493, + "rewards/margins": 0.13562212884426117, + "rewards/rejected": -0.8054167628288269, + "step": 5030 + }, + { + "epoch": 0.8683666436940042, + "grad_norm": 20.943683624267578, + "learning_rate": 8.946285771254948e-08, + "logits/chosen": -2.735042095184326, + "logits/rejected": -2.705932855606079, + "logps/chosen": -125.88616943359375, + "logps/rejected": -130.8765106201172, + "loss": 0.6496, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.6771482229232788, + "rewards/margins": 0.13066670298576355, + "rewards/rejected": -0.80781489610672, + "step": 5040 + }, + { + "epoch": 0.8700895933838731, + "grad_norm": 14.9611177444458, + "learning_rate": 8.940122339613927e-08, + "logits/chosen": -2.649893045425415, + "logits/rejected": -2.6332714557647705, + "logps/chosen": -124.31199645996094, + "logps/rejected": -136.4893035888672, + "loss": 0.6442, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.6720517873764038, + "rewards/margins": 0.14682091772556305, + "rewards/rejected": -0.8188726305961609, + "step": 5050 + }, + { + "epoch": 0.8718125430737422, + "grad_norm": 17.631919860839844, + "learning_rate": 8.933943071031359e-08, + "logits/chosen": -2.5778708457946777, + "logits/rejected": -2.573026657104492, + "logps/chosen": -118.61930084228516, + "logps/rejected": -129.90167236328125, + "loss": 0.663, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.6789823770523071, + "rewards/margins": 0.1023738831281662, + "rewards/rejected": -0.7813562154769897, + "step": 5060 + }, + { + "epoch": 0.8735354927636113, + "grad_norm": 14.673048973083496, + "learning_rate": 8.92774799034422e-08, + "logits/chosen": -2.619852066040039, + "logits/rejected": -2.60081148147583, + "logps/chosen": -121.1645278930664, + "logps/rejected": -126.4146499633789, + "loss": 0.6594, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.6706386208534241, + "rewards/margins": 0.12023203074932098, + "rewards/rejected": -0.790870726108551, + "step": 5070 + }, + { + "epoch": 0.8752584424534804, + "grad_norm": 15.462677001953125, + "learning_rate": 8.921537122453037e-08, + "logits/chosen": -2.784825086593628, + "logits/rejected": -2.755561351776123, + "logps/chosen": -116.18212890625, + "logps/rejected": -121.95218658447266, + "loss": 0.6467, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5815830230712891, + "rewards/margins": 0.13263188302516937, + "rewards/rejected": -0.7142149209976196, + "step": 5080 + }, + { + "epoch": 0.8769813921433495, + "grad_norm": 16.479412078857422, + "learning_rate": 8.915310492321799e-08, + "logits/chosen": -2.6742799282073975, + "logits/rejected": -2.6519150733947754, + "logps/chosen": -112.9342041015625, + "logps/rejected": -128.68777465820312, + "loss": 0.6148, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.5782161951065063, + "rewards/margins": 0.20093123614788055, + "rewards/rejected": -0.7791474461555481, + "step": 5090 + }, + { + "epoch": 0.8787043418332184, + "grad_norm": 28.277772903442383, + "learning_rate": 8.909068124977839e-08, + "logits/chosen": -2.5798137187957764, + "logits/rejected": -2.5520870685577393, + "logps/chosen": -119.62324523925781, + "logps/rejected": -126.19020080566406, + "loss": 0.6606, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.6352814435958862, + "rewards/margins": 0.10623638331890106, + "rewards/rejected": -0.7415178418159485, + "step": 5100 + }, + { + "epoch": 0.8804272915230875, + "grad_norm": 16.072378158569336, + "learning_rate": 8.902810045511753e-08, + "logits/chosen": -2.685297727584839, + "logits/rejected": -2.6612815856933594, + "logps/chosen": -126.0315933227539, + "logps/rejected": -130.48361206054688, + "loss": 0.6739, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": -0.6738030314445496, + "rewards/margins": 0.09819827973842621, + "rewards/rejected": -0.772001326084137, + "step": 5110 + }, + { + "epoch": 0.8821502412129566, + "grad_norm": 15.368549346923828, + "learning_rate": 8.896536279077287e-08, + "logits/chosen": -2.690248966217041, + "logits/rejected": -2.668186902999878, + "logps/chosen": -111.31550598144531, + "logps/rejected": -119.72273254394531, + "loss": 0.6442, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.5687953233718872, + "rewards/margins": 0.1302814781665802, + "rewards/rejected": -0.6990768313407898, + "step": 5120 + }, + { + "epoch": 0.8838731909028257, + "grad_norm": 14.595114707946777, + "learning_rate": 8.89024685089124e-08, + "logits/chosen": -2.72908091545105, + "logits/rejected": -2.7011799812316895, + "logps/chosen": -114.46369934082031, + "logps/rejected": -120.93522644042969, + "loss": 0.6506, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.5987579822540283, + "rewards/margins": 0.11958907544612885, + "rewards/rejected": -0.7183471322059631, + "step": 5130 + }, + { + "epoch": 0.8855961405926946, + "grad_norm": 14.865375518798828, + "learning_rate": 8.883941786233363e-08, + "logits/chosen": -2.6741504669189453, + "logits/rejected": -2.646867275238037, + "logps/chosen": -115.9564208984375, + "logps/rejected": -119.01458740234375, + "loss": 0.6591, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5695517659187317, + "rewards/margins": 0.10228168964385986, + "rewards/rejected": -0.6718333959579468, + "step": 5140 + }, + { + "epoch": 0.8873190902825637, + "grad_norm": 17.496746063232422, + "learning_rate": 8.877621110446253e-08, + "logits/chosen": -2.6742119789123535, + "logits/rejected": -2.6619982719421387, + "logps/chosen": -110.34440612792969, + "logps/rejected": -125.23421478271484, + "loss": 0.6378, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.562449038028717, + "rewards/margins": 0.15165221691131592, + "rewards/rejected": -0.7141013145446777, + "step": 5150 + }, + { + "epoch": 0.8890420399724328, + "grad_norm": 18.376039505004883, + "learning_rate": 8.871284848935256e-08, + "logits/chosen": -2.6834418773651123, + "logits/rejected": -2.6559715270996094, + "logps/chosen": -114.18431091308594, + "logps/rejected": -120.5025405883789, + "loss": 0.6589, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.6160685420036316, + "rewards/margins": 0.11094069480895996, + "rewards/rejected": -0.7270091772079468, + "step": 5160 + }, + { + "epoch": 0.8907649896623019, + "grad_norm": 11.63632583618164, + "learning_rate": 8.864933027168367e-08, + "logits/chosen": -2.6618878841400146, + "logits/rejected": -2.6420178413391113, + "logps/chosen": -114.88929748535156, + "logps/rejected": -129.62692260742188, + "loss": 0.6179, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.5751225352287292, + "rewards/margins": 0.2080199271440506, + "rewards/rejected": -0.7831424474716187, + "step": 5170 + }, + { + "epoch": 0.892487939352171, + "grad_norm": 18.189315795898438, + "learning_rate": 8.858565670676117e-08, + "logits/chosen": -2.742037296295166, + "logits/rejected": -2.7349371910095215, + "logps/chosen": -122.88726806640625, + "logps/rejected": -129.7611846923828, + "loss": 0.6676, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.6671861410140991, + "rewards/margins": 0.0882277563214302, + "rewards/rejected": -0.755413830280304, + "step": 5180 + }, + { + "epoch": 0.8942108890420399, + "grad_norm": 16.983613967895508, + "learning_rate": 8.852182805051485e-08, + "logits/chosen": -2.6411783695220947, + "logits/rejected": -2.6334564685821533, + "logps/chosen": -120.18841552734375, + "logps/rejected": -129.97769165039062, + "loss": 0.6659, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.671235203742981, + "rewards/margins": 0.09620723873376846, + "rewards/rejected": -0.767442524433136, + "step": 5190 + }, + { + "epoch": 0.895933838731909, + "grad_norm": 16.245668411254883, + "learning_rate": 8.845784455949778e-08, + "logits/chosen": -2.712747097015381, + "logits/rejected": -2.692870616912842, + "logps/chosen": -119.13944244384766, + "logps/rejected": -133.7500457763672, + "loss": 0.6383, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.6625715494155884, + "rewards/margins": 0.1500093638896942, + "rewards/rejected": -0.812580943107605, + "step": 5200 + }, + { + "epoch": 0.895933838731909, + "eval_logits/chosen": -2.714919328689575, + "eval_logits/rejected": -2.7111740112304688, + "eval_logps/chosen": -110.81185913085938, + "eval_logps/rejected": -123.86541748046875, + "eval_loss": 0.6621139049530029, + "eval_rewards/accuracies": 0.6054832935333252, + "eval_rewards/chosen": -0.5179638266563416, + "eval_rewards/margins": 0.0931943878531456, + "eval_rewards/rejected": -0.6111582517623901, + "eval_runtime": 383.2873, + "eval_samples_per_second": 11.229, + "eval_steps_per_second": 1.404, + "step": 5200 + }, + { + "epoch": 0.8976567884217781, + "grad_norm": 13.328145980834961, + "learning_rate": 8.839370649088546e-08, + "logits/chosen": -2.6668667793273926, + "logits/rejected": -2.65277361869812, + "logps/chosen": -119.27299499511719, + "logps/rejected": -124.73472595214844, + "loss": 0.6707, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.6674652099609375, + "rewards/margins": 0.08735474944114685, + "rewards/rejected": -0.7548199892044067, + "step": 5210 + }, + { + "epoch": 0.8993797381116472, + "grad_norm": 16.024778366088867, + "learning_rate": 8.83294141024747e-08, + "logits/chosen": -2.683475971221924, + "logits/rejected": -2.6743195056915283, + "logps/chosen": -113.74040222167969, + "logps/rejected": -123.02177429199219, + "loss": 0.6752, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.605230450630188, + "rewards/margins": 0.07791049778461456, + "rewards/rejected": -0.6831408739089966, + "step": 5220 + }, + { + "epoch": 0.9011026878015161, + "grad_norm": 15.768523216247559, + "learning_rate": 8.826496765268248e-08, + "logits/chosen": -2.6493020057678223, + "logits/rejected": -2.642354726791382, + "logps/chosen": -114.64256286621094, + "logps/rejected": -130.42835998535156, + "loss": 0.6239, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.5734395980834961, + "rewards/margins": 0.18039682507514954, + "rewards/rejected": -0.7538365125656128, + "step": 5230 + }, + { + "epoch": 0.9028256374913852, + "grad_norm": 16.26175308227539, + "learning_rate": 8.820036740054516e-08, + "logits/chosen": -2.6304409503936768, + "logits/rejected": -2.6143264770507812, + "logps/chosen": -116.3860092163086, + "logps/rejected": -136.6753692626953, + "loss": 0.6156, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.6271839141845703, + "rewards/margins": 0.2139817178249359, + "rewards/rejected": -0.8411655426025391, + "step": 5240 + }, + { + "epoch": 0.9045485871812543, + "grad_norm": 15.620439529418945, + "learning_rate": 8.813561360571715e-08, + "logits/chosen": -2.5899417400360107, + "logits/rejected": -2.5807578563690186, + "logps/chosen": -114.19087219238281, + "logps/rejected": -127.84722900390625, + "loss": 0.6389, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.6201349496841431, + "rewards/margins": 0.1543225795030594, + "rewards/rejected": -0.774457573890686, + "step": 5250 + }, + { + "epoch": 0.9062715368711234, + "grad_norm": 22.585975646972656, + "learning_rate": 8.807070652847014e-08, + "logits/chosen": -2.6384902000427246, + "logits/rejected": -2.607151508331299, + "logps/chosen": -118.38133239746094, + "logps/rejected": -128.50912475585938, + "loss": 0.6337, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.6117981672286987, + "rewards/margins": 0.16677974164485931, + "rewards/rejected": -0.778577983379364, + "step": 5260 + }, + { + "epoch": 0.9079944865609925, + "grad_norm": 15.792411804199219, + "learning_rate": 8.800564642969182e-08, + "logits/chosen": -2.722902297973633, + "logits/rejected": -2.7080540657043457, + "logps/chosen": -117.5638656616211, + "logps/rejected": -130.32357788085938, + "loss": 0.6436, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.6469516754150391, + "rewards/margins": 0.15124547481536865, + "rewards/rejected": -0.7981971502304077, + "step": 5270 + }, + { + "epoch": 0.9097174362508614, + "grad_norm": 14.785796165466309, + "learning_rate": 8.794043357088501e-08, + "logits/chosen": -2.6592485904693604, + "logits/rejected": -2.636044979095459, + "logps/chosen": -121.824951171875, + "logps/rejected": -129.5763702392578, + "loss": 0.6482, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.6678295135498047, + "rewards/margins": 0.13105496764183044, + "rewards/rejected": -0.7988845109939575, + "step": 5280 + }, + { + "epoch": 0.9114403859407305, + "grad_norm": 20.555374145507812, + "learning_rate": 8.787506821416648e-08, + "logits/chosen": -2.6226296424865723, + "logits/rejected": -2.5907247066497803, + "logps/chosen": -119.1449966430664, + "logps/rejected": -126.7865219116211, + "loss": 0.6474, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.621123194694519, + "rewards/margins": 0.1367107331752777, + "rewards/rejected": -0.7578339576721191, + "step": 5290 + }, + { + "epoch": 0.9131633356305996, + "grad_norm": 16.926069259643555, + "learning_rate": 8.780955062226598e-08, + "logits/chosen": -2.634366750717163, + "logits/rejected": -2.625247001647949, + "logps/chosen": -115.128662109375, + "logps/rejected": -128.33938598632812, + "loss": 0.6332, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.5826433897018433, + "rewards/margins": 0.15657159686088562, + "rewards/rejected": -0.7392150163650513, + "step": 5300 + }, + { + "epoch": 0.9148862853204687, + "grad_norm": 19.48094367980957, + "learning_rate": 8.774388105852517e-08, + "logits/chosen": -2.7098991870880127, + "logits/rejected": -2.7008213996887207, + "logps/chosen": -123.44694519042969, + "logps/rejected": -125.55406188964844, + "loss": 0.6657, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.6424458622932434, + "rewards/margins": 0.09618823230266571, + "rewards/rejected": -0.7386341094970703, + "step": 5310 + }, + { + "epoch": 0.9166092350103378, + "grad_norm": 17.43693733215332, + "learning_rate": 8.767805978689651e-08, + "logits/chosen": -2.655910015106201, + "logits/rejected": -2.611663818359375, + "logps/chosen": -121.01139068603516, + "logps/rejected": -127.2607192993164, + "loss": 0.633, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.621241569519043, + "rewards/margins": 0.16151300072669983, + "rewards/rejected": -0.7827545404434204, + "step": 5320 + }, + { + "epoch": 0.9183321847002067, + "grad_norm": 15.040596961975098, + "learning_rate": 8.761208707194223e-08, + "logits/chosen": -2.6152052879333496, + "logits/rejected": -2.6168980598449707, + "logps/chosen": -116.71577453613281, + "logps/rejected": -136.49380493164062, + "loss": 0.6233, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6542342305183411, + "rewards/margins": 0.19011548161506653, + "rewards/rejected": -0.84434974193573, + "step": 5330 + }, + { + "epoch": 0.9200551343900758, + "grad_norm": 20.719085693359375, + "learning_rate": 8.754596317883332e-08, + "logits/chosen": -2.6348819732666016, + "logits/rejected": -2.5992424488067627, + "logps/chosen": -123.36546325683594, + "logps/rejected": -130.93771362304688, + "loss": 0.6367, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.6524416208267212, + "rewards/margins": 0.14736978709697723, + "rewards/rejected": -0.7998114824295044, + "step": 5340 + }, + { + "epoch": 0.9217780840799449, + "grad_norm": 13.634549140930176, + "learning_rate": 8.747968837334837e-08, + "logits/chosen": -2.6150898933410645, + "logits/rejected": -2.590125322341919, + "logps/chosen": -120.84356689453125, + "logps/rejected": -132.3267364501953, + "loss": 0.6436, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.6730222702026367, + "rewards/margins": 0.14201988279819489, + "rewards/rejected": -0.8150421380996704, + "step": 5350 + }, + { + "epoch": 0.923501033769814, + "grad_norm": 19.82248306274414, + "learning_rate": 8.741326292187257e-08, + "logits/chosen": -2.666996479034424, + "logits/rejected": -2.6632487773895264, + "logps/chosen": -118.6470947265625, + "logps/rejected": -140.34963989257812, + "loss": 0.6188, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.6400326490402222, + "rewards/margins": 0.20354370772838593, + "rewards/rejected": -0.8435763120651245, + "step": 5360 + }, + { + "epoch": 0.9252239834596829, + "grad_norm": 21.43902587890625, + "learning_rate": 8.734668709139663e-08, + "logits/chosen": -2.610877752304077, + "logits/rejected": -2.6005136966705322, + "logps/chosen": -122.8233413696289, + "logps/rejected": -130.63795471191406, + "loss": 0.6712, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.6907214522361755, + "rewards/margins": 0.08820674568414688, + "rewards/rejected": -0.7789281606674194, + "step": 5370 + }, + { + "epoch": 0.926946933149552, + "grad_norm": 18.302011489868164, + "learning_rate": 8.727996114951566e-08, + "logits/chosen": -2.683851957321167, + "logits/rejected": -2.658590316772461, + "logps/chosen": -130.3781280517578, + "logps/rejected": -135.687744140625, + "loss": 0.6489, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.7127219438552856, + "rewards/margins": 0.13830821216106415, + "rewards/rejected": -0.851030170917511, + "step": 5380 + }, + { + "epoch": 0.9286698828394211, + "grad_norm": 18.31463050842285, + "learning_rate": 8.721308536442814e-08, + "logits/chosen": -2.6187527179718018, + "logits/rejected": -2.5875821113586426, + "logps/chosen": -130.81814575195312, + "logps/rejected": -134.22793579101562, + "loss": 0.6592, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.7203988432884216, + "rewards/margins": 0.1155281662940979, + "rewards/rejected": -0.8359270095825195, + "step": 5390 + }, + { + "epoch": 0.9303928325292902, + "grad_norm": 17.170841217041016, + "learning_rate": 8.714606000493482e-08, + "logits/chosen": -2.634840250015259, + "logits/rejected": -2.6232903003692627, + "logps/chosen": -115.47178649902344, + "logps/rejected": -137.46221923828125, + "loss": 0.6212, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.6360267400741577, + "rewards/margins": 0.20517496764659882, + "rewards/rejected": -0.8412017822265625, + "step": 5400 + }, + { + "epoch": 0.9321157822191593, + "grad_norm": 29.442955017089844, + "learning_rate": 8.707888534043772e-08, + "logits/chosen": -2.6460509300231934, + "logits/rejected": -2.6388349533081055, + "logps/chosen": -129.9828643798828, + "logps/rejected": -134.42715454101562, + "loss": 0.674, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.7190936803817749, + "rewards/margins": 0.08293198049068451, + "rewards/rejected": -0.8020256161689758, + "step": 5410 + }, + { + "epoch": 0.9338387319090282, + "grad_norm": 22.263671875, + "learning_rate": 8.701156164093888e-08, + "logits/chosen": -2.6946921348571777, + "logits/rejected": -2.6782920360565186, + "logps/chosen": -123.95640563964844, + "logps/rejected": -132.80471801757812, + "loss": 0.6583, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.6873513460159302, + "rewards/margins": 0.1127699613571167, + "rewards/rejected": -0.8001214265823364, + "step": 5420 + }, + { + "epoch": 0.9355616815988973, + "grad_norm": 18.80925178527832, + "learning_rate": 8.694408917703942e-08, + "logits/chosen": -2.6444718837738037, + "logits/rejected": -2.636350631713867, + "logps/chosen": -126.26493072509766, + "logps/rejected": -136.88291931152344, + "loss": 0.6464, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.720744252204895, + "rewards/margins": 0.14062415063381195, + "rewards/rejected": -0.8613685369491577, + "step": 5430 + }, + { + "epoch": 0.9372846312887664, + "grad_norm": 21.596277236938477, + "learning_rate": 8.68764682199384e-08, + "logits/chosen": -2.649989604949951, + "logits/rejected": -2.637254476547241, + "logps/chosen": -127.28349304199219, + "logps/rejected": -138.6378631591797, + "loss": 0.6493, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.7071490287780762, + "rewards/margins": 0.13525404036045074, + "rewards/rejected": -0.8424030542373657, + "step": 5440 + }, + { + "epoch": 0.9390075809786355, + "grad_norm": 19.736589431762695, + "learning_rate": 8.680869904143172e-08, + "logits/chosen": -2.606205463409424, + "logits/rejected": -2.5994503498077393, + "logps/chosen": -126.2559585571289, + "logps/rejected": -139.34756469726562, + "loss": 0.633, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.6941956877708435, + "rewards/margins": 0.17159244418144226, + "rewards/rejected": -0.8657881617546082, + "step": 5450 + }, + { + "epoch": 0.9407305306685044, + "grad_norm": 16.4277400970459, + "learning_rate": 8.674078191391108e-08, + "logits/chosen": -2.6205191612243652, + "logits/rejected": -2.6084847450256348, + "logps/chosen": -122.77671813964844, + "logps/rejected": -127.8461685180664, + "loss": 0.671, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.6828287243843079, + "rewards/margins": 0.08287617564201355, + "rewards/rejected": -0.7657049298286438, + "step": 5460 + }, + { + "epoch": 0.9424534803583735, + "grad_norm": 15.285874366760254, + "learning_rate": 8.66727171103628e-08, + "logits/chosen": -2.6129372119903564, + "logits/rejected": -2.6003642082214355, + "logps/chosen": -121.86590576171875, + "logps/rejected": -127.73139953613281, + "loss": 0.6835, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.6971479654312134, + "rewards/margins": 0.0788634642958641, + "rewards/rejected": -0.7760114073753357, + "step": 5470 + }, + { + "epoch": 0.9441764300482426, + "grad_norm": 17.484487533569336, + "learning_rate": 8.66045049043668e-08, + "logits/chosen": -2.6272430419921875, + "logits/rejected": -2.60965633392334, + "logps/chosen": -120.57098388671875, + "logps/rejected": -134.08132934570312, + "loss": 0.6295, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.6604935526847839, + "rewards/margins": 0.17122098803520203, + "rewards/rejected": -0.8317145109176636, + "step": 5480 + }, + { + "epoch": 0.9458993797381117, + "grad_norm": 15.28332233428955, + "learning_rate": 8.653614557009546e-08, + "logits/chosen": -2.6022276878356934, + "logits/rejected": -2.581787109375, + "logps/chosen": -122.64559173583984, + "logps/rejected": -127.6083984375, + "loss": 0.6602, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.6580213308334351, + "rewards/margins": 0.1082652360200882, + "rewards/rejected": -0.7662865519523621, + "step": 5490 + }, + { + "epoch": 0.9476223294279807, + "grad_norm": 18.69580078125, + "learning_rate": 8.646763938231252e-08, + "logits/chosen": -2.6790108680725098, + "logits/rejected": -2.6685914993286133, + "logps/chosen": -118.9625473022461, + "logps/rejected": -138.61312866210938, + "loss": 0.6201, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.6429628133773804, + "rewards/margins": 0.20192137360572815, + "rewards/rejected": -0.8448840975761414, + "step": 5500 + }, + { + "epoch": 0.9493452791178497, + "grad_norm": 23.761962890625, + "learning_rate": 8.6398986616372e-08, + "logits/chosen": -2.6038689613342285, + "logits/rejected": -2.593151569366455, + "logps/chosen": -124.92496490478516, + "logps/rejected": -134.55838012695312, + "loss": 0.6578, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.7127937078475952, + "rewards/margins": 0.10803844779729843, + "rewards/rejected": -0.8208320736885071, + "step": 5510 + }, + { + "epoch": 0.9510682288077188, + "grad_norm": 18.504318237304688, + "learning_rate": 8.633018754821704e-08, + "logits/chosen": -2.595757007598877, + "logits/rejected": -2.5816268920898438, + "logps/chosen": -124.0481948852539, + "logps/rejected": -135.4131317138672, + "loss": 0.6265, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.6618342399597168, + "rewards/margins": 0.18380990624427795, + "rewards/rejected": -0.8456441760063171, + "step": 5520 + }, + { + "epoch": 0.9527911784975879, + "grad_norm": 15.531015396118164, + "learning_rate": 8.62612424543789e-08, + "logits/chosen": -2.640845775604248, + "logits/rejected": -2.612797498703003, + "logps/chosen": -131.6069793701172, + "logps/rejected": -133.2763671875, + "loss": 0.6602, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.7045750617980957, + "rewards/margins": 0.11494274437427521, + "rewards/rejected": -0.8195177912712097, + "step": 5530 + }, + { + "epoch": 0.954514128187457, + "grad_norm": 19.734302520751953, + "learning_rate": 8.61921516119757e-08, + "logits/chosen": -2.624042510986328, + "logits/rejected": -2.6247589588165283, + "logps/chosen": -121.94276428222656, + "logps/rejected": -139.32431030273438, + "loss": 0.6441, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6817771792411804, + "rewards/margins": 0.15398737788200378, + "rewards/rejected": -0.8357645869255066, + "step": 5540 + }, + { + "epoch": 0.956237077877326, + "grad_norm": 16.820507049560547, + "learning_rate": 8.612291529871146e-08, + "logits/chosen": -2.542262554168701, + "logits/rejected": -2.530670642852783, + "logps/chosen": -124.15339660644531, + "logps/rejected": -132.162109375, + "loss": 0.6824, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.7120052576065063, + "rewards/margins": 0.07154157012701035, + "rewards/rejected": -0.7835467457771301, + "step": 5550 + }, + { + "epoch": 0.957960027567195, + "grad_norm": 19.347867965698242, + "learning_rate": 8.605353379287478e-08, + "logits/chosen": -2.6067452430725098, + "logits/rejected": -2.5873658657073975, + "logps/chosen": -117.42878723144531, + "logps/rejected": -125.9688720703125, + "loss": 0.6513, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6437867879867554, + "rewards/margins": 0.1278226375579834, + "rewards/rejected": -0.7716094255447388, + "step": 5560 + }, + { + "epoch": 0.9596829772570641, + "grad_norm": 15.14845085144043, + "learning_rate": 8.5984007373338e-08, + "logits/chosen": -2.6609604358673096, + "logits/rejected": -2.655918836593628, + "logps/chosen": -116.06624603271484, + "logps/rejected": -131.27557373046875, + "loss": 0.6581, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.6413980722427368, + "rewards/margins": 0.1107332855463028, + "rewards/rejected": -0.7521313428878784, + "step": 5570 + }, + { + "epoch": 0.9614059269469332, + "grad_norm": 18.013757705688477, + "learning_rate": 8.591433631955582e-08, + "logits/chosen": -2.582545757293701, + "logits/rejected": -2.574622392654419, + "logps/chosen": -118.67476654052734, + "logps/rejected": -134.64376831054688, + "loss": 0.6315, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.645140528678894, + "rewards/margins": 0.16778312623500824, + "rewards/rejected": -0.8129236102104187, + "step": 5580 + }, + { + "epoch": 0.9631288766368022, + "grad_norm": 15.750480651855469, + "learning_rate": 8.584452091156432e-08, + "logits/chosen": -2.675096035003662, + "logits/rejected": -2.6426644325256348, + "logps/chosen": -120.98304748535156, + "logps/rejected": -134.9139404296875, + "loss": 0.6418, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.6380943059921265, + "rewards/margins": 0.1646120846271515, + "rewards/rejected": -0.8027063608169556, + "step": 5590 + }, + { + "epoch": 0.9648518263266712, + "grad_norm": 21.686899185180664, + "learning_rate": 8.57745614299798e-08, + "logits/chosen": -2.6570706367492676, + "logits/rejected": -2.6563992500305176, + "logps/chosen": -120.1393051147461, + "logps/rejected": -138.4915008544922, + "loss": 0.6411, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.6697486042976379, + "rewards/margins": 0.1515139788389206, + "rewards/rejected": -0.8212626576423645, + "step": 5600 + }, + { + "epoch": 0.9648518263266712, + "eval_logits/chosen": -2.6910486221313477, + "eval_logits/rejected": -2.686899423599243, + "eval_logps/chosen": -111.2964859008789, + "eval_logps/rejected": -124.09293365478516, + "eval_loss": 0.6622869372367859, + "eval_rewards/accuracies": 0.6054832935333252, + "eval_rewards/chosen": -0.5228100419044495, + "eval_rewards/margins": 0.09062344580888748, + "eval_rewards/rejected": -0.6134334802627563, + "eval_runtime": 383.39, + "eval_samples_per_second": 11.226, + "eval_steps_per_second": 1.403, + "step": 5600 + }, + { + "epoch": 0.9665747760165403, + "grad_norm": 18.368070602416992, + "learning_rate": 8.570445815599767e-08, + "logits/chosen": -2.6654767990112305, + "logits/rejected": -2.654480218887329, + "logps/chosen": -121.57368469238281, + "logps/rejected": -145.0277557373047, + "loss": 0.6176, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.6873144507408142, + "rewards/margins": 0.208953857421875, + "rewards/rejected": -0.8962682485580444, + "step": 5610 + }, + { + "epoch": 0.9682977257064094, + "grad_norm": 26.18334197998047, + "learning_rate": 8.563421137139123e-08, + "logits/chosen": -2.59879994392395, + "logits/rejected": -2.5828347206115723, + "logps/chosen": -129.18121337890625, + "logps/rejected": -138.1641082763672, + "loss": 0.6486, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.7375103831291199, + "rewards/margins": 0.14180725812911987, + "rewards/rejected": -0.8793177604675293, + "step": 5620 + }, + { + "epoch": 0.9700206753962785, + "grad_norm": 18.84343719482422, + "learning_rate": 8.556382135851068e-08, + "logits/chosen": -2.6542553901672363, + "logits/rejected": -2.6300244331359863, + "logps/chosen": -127.53597259521484, + "logps/rejected": -133.26052856445312, + "loss": 0.6593, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.7077816724777222, + "rewards/margins": 0.119412362575531, + "rewards/rejected": -0.8271940350532532, + "step": 5630 + }, + { + "epoch": 0.9717436250861475, + "grad_norm": 21.3780574798584, + "learning_rate": 8.549328840028187e-08, + "logits/chosen": -2.6287741661071777, + "logits/rejected": -2.618954658508301, + "logps/chosen": -120.96806335449219, + "logps/rejected": -137.76504516601562, + "loss": 0.641, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.6567273139953613, + "rewards/margins": 0.16713783144950867, + "rewards/rejected": -0.8238651156425476, + "step": 5640 + }, + { + "epoch": 0.9734665747760165, + "grad_norm": 19.235515594482422, + "learning_rate": 8.542261278020524e-08, + "logits/chosen": -2.560713291168213, + "logits/rejected": -2.5502090454101562, + "logps/chosen": -119.46761322021484, + "logps/rejected": -136.87493896484375, + "loss": 0.6323, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.6769036054611206, + "rewards/margins": 0.17919696867465973, + "rewards/rejected": -0.8561005592346191, + "step": 5650 + }, + { + "epoch": 0.9751895244658856, + "grad_norm": 19.590194702148438, + "learning_rate": 8.535179478235461e-08, + "logits/chosen": -2.5745973587036133, + "logits/rejected": -2.5703866481781006, + "logps/chosen": -124.3803939819336, + "logps/rejected": -138.00247192382812, + "loss": 0.6457, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.7225823998451233, + "rewards/margins": 0.15808936953544617, + "rewards/rejected": -0.8806716799736023, + "step": 5660 + }, + { + "epoch": 0.9769124741557547, + "grad_norm": 16.50753402709961, + "learning_rate": 8.52808346913761e-08, + "logits/chosen": -2.600623846054077, + "logits/rejected": -2.590681552886963, + "logps/chosen": -126.36222839355469, + "logps/rejected": -136.4040985107422, + "loss": 0.6496, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.7098848819732666, + "rewards/margins": 0.1333651840686798, + "rewards/rejected": -0.8432499766349792, + "step": 5670 + }, + { + "epoch": 0.9786354238456237, + "grad_norm": 22.130084991455078, + "learning_rate": 8.520973279248694e-08, + "logits/chosen": -2.6281137466430664, + "logits/rejected": -2.5988149642944336, + "logps/chosen": -130.99139404296875, + "logps/rejected": -144.82325744628906, + "loss": 0.6252, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.7589556574821472, + "rewards/margins": 0.17990081012248993, + "rewards/rejected": -0.9388564825057983, + "step": 5680 + }, + { + "epoch": 0.9803583735354927, + "grad_norm": 17.92255210876465, + "learning_rate": 8.513848937147434e-08, + "logits/chosen": -2.651932954788208, + "logits/rejected": -2.62499737739563, + "logps/chosen": -135.4591064453125, + "logps/rejected": -145.86044311523438, + "loss": 0.6232, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.754391074180603, + "rewards/margins": 0.19621720910072327, + "rewards/rejected": -0.9506082534790039, + "step": 5690 + }, + { + "epoch": 0.9820813232253618, + "grad_norm": 31.532100677490234, + "learning_rate": 8.506710471469438e-08, + "logits/chosen": -2.5525314807891846, + "logits/rejected": -2.5413818359375, + "logps/chosen": -135.53036499023438, + "logps/rejected": -146.36544799804688, + "loss": 0.6457, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.774732232093811, + "rewards/margins": 0.15803535282611847, + "rewards/rejected": -0.9327676892280579, + "step": 5700 + }, + { + "epoch": 0.9838042729152309, + "grad_norm": 17.127050399780273, + "learning_rate": 8.499557910907078e-08, + "logits/chosen": -2.657351493835449, + "logits/rejected": -2.638766050338745, + "logps/chosen": -135.3360137939453, + "logps/rejected": -150.8716278076172, + "loss": 0.6277, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.7916986346244812, + "rewards/margins": 0.19900405406951904, + "rewards/rejected": -0.9907026290893555, + "step": 5710 + }, + { + "epoch": 0.9855272226051, + "grad_norm": 21.06365394592285, + "learning_rate": 8.492391284209383e-08, + "logits/chosen": -2.6116607189178467, + "logits/rejected": -2.5935304164886475, + "logps/chosen": -131.7152862548828, + "logps/rejected": -146.989013671875, + "loss": 0.6208, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.7815282344818115, + "rewards/margins": 0.20366664230823517, + "rewards/rejected": -0.9851948618888855, + "step": 5720 + }, + { + "epoch": 0.987250172294969, + "grad_norm": 18.155738830566406, + "learning_rate": 8.485210620181915e-08, + "logits/chosen": -2.674208402633667, + "logits/rejected": -2.6654229164123535, + "logps/chosen": -129.9379425048828, + "logps/rejected": -144.26626586914062, + "loss": 0.6416, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7705144882202148, + "rewards/margins": 0.16067850589752197, + "rewards/rejected": -0.9311929941177368, + "step": 5730 + }, + { + "epoch": 0.988973121984838, + "grad_norm": 19.837358474731445, + "learning_rate": 8.478015947686664e-08, + "logits/chosen": -2.5953633785247803, + "logits/rejected": -2.572862148284912, + "logps/chosen": -142.89976501464844, + "logps/rejected": -153.91061401367188, + "loss": 0.6368, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.8325818777084351, + "rewards/margins": 0.17331066727638245, + "rewards/rejected": -1.0058925151824951, + "step": 5740 + }, + { + "epoch": 0.9906960716747071, + "grad_norm": 28.6536808013916, + "learning_rate": 8.470807295641917e-08, + "logits/chosen": -2.668067455291748, + "logits/rejected": -2.651562213897705, + "logps/chosen": -136.50344848632812, + "logps/rejected": -136.56271362304688, + "loss": 0.6732, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.7828587293624878, + "rewards/margins": 0.08392515778541565, + "rewards/rejected": -0.8667839169502258, + "step": 5750 + }, + { + "epoch": 0.9924190213645762, + "grad_norm": 16.913461685180664, + "learning_rate": 8.463584693022156e-08, + "logits/chosen": -2.6280453205108643, + "logits/rejected": -2.607534408569336, + "logps/chosen": -129.51055908203125, + "logps/rejected": -139.5177001953125, + "loss": 0.6549, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.7246989011764526, + "rewards/margins": 0.12306500971317291, + "rewards/rejected": -0.8477638959884644, + "step": 5760 + }, + { + "epoch": 0.9941419710544452, + "grad_norm": 18.49822998046875, + "learning_rate": 8.45634816885794e-08, + "logits/chosen": -2.5809426307678223, + "logits/rejected": -2.5648703575134277, + "logps/chosen": -118.6301040649414, + "logps/rejected": -136.79217529296875, + "loss": 0.6287, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.659207820892334, + "rewards/margins": 0.17542117834091187, + "rewards/rejected": -0.8346290588378906, + "step": 5770 + }, + { + "epoch": 0.9958649207443143, + "grad_norm": 24.4649600982666, + "learning_rate": 8.449097752235776e-08, + "logits/chosen": -2.5920777320861816, + "logits/rejected": -2.577484607696533, + "logps/chosen": -127.2874984741211, + "logps/rejected": -141.57028198242188, + "loss": 0.649, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.7384024858474731, + "rewards/margins": 0.1452442705631256, + "rewards/rejected": -0.8836467862129211, + "step": 5780 + }, + { + "epoch": 0.9975878704341833, + "grad_norm": 15.391559600830078, + "learning_rate": 8.441833472298014e-08, + "logits/chosen": -2.556666612625122, + "logits/rejected": -2.527010440826416, + "logps/chosen": -114.5416259765625, + "logps/rejected": -134.868408203125, + "loss": 0.6269, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.6321970224380493, + "rewards/margins": 0.17795422673225403, + "rewards/rejected": -0.8101511001586914, + "step": 5790 + }, + { + "epoch": 0.9993108201240524, + "grad_norm": 18.151533126831055, + "learning_rate": 8.434555358242728e-08, + "logits/chosen": -2.6377694606781006, + "logits/rejected": -2.6138923168182373, + "logps/chosen": -118.05354309082031, + "logps/rejected": -138.7180633544922, + "loss": 0.6149, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.6627640128135681, + "rewards/margins": 0.2130320817232132, + "rewards/rejected": -0.8757961392402649, + "step": 5800 + }, + { + "epoch": 1.0010337698139213, + "grad_norm": 19.002288818359375, + "learning_rate": 8.427263439323593e-08, + "logits/chosen": -2.645638942718506, + "logits/rejected": -2.6332497596740723, + "logps/chosen": -130.27548217773438, + "logps/rejected": -143.0097198486328, + "loss": 0.6485, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.7482485771179199, + "rewards/margins": 0.15110786259174347, + "rewards/rejected": -0.8993565440177917, + "step": 5810 + }, + { + "epoch": 1.0027567195037905, + "grad_norm": 17.686214447021484, + "learning_rate": 8.419957744849773e-08, + "logits/chosen": -2.644108533859253, + "logits/rejected": -2.6132802963256836, + "logps/chosen": -132.90487670898438, + "logps/rejected": -155.9635009765625, + "loss": 0.5902, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.7895268797874451, + "rewards/margins": 0.28397274017333984, + "rewards/rejected": -1.0734995603561401, + "step": 5820 + }, + { + "epoch": 1.0044796691936595, + "grad_norm": 26.239715576171875, + "learning_rate": 8.412638304185805e-08, + "logits/chosen": -2.5749363899230957, + "logits/rejected": -2.5539708137512207, + "logps/chosen": -139.7153778076172, + "logps/rejected": -152.18222045898438, + "loss": 0.6493, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.8502417802810669, + "rewards/margins": 0.15593525767326355, + "rewards/rejected": -1.0061770677566528, + "step": 5830 + }, + { + "epoch": 1.0062026188835287, + "grad_norm": 17.349905014038086, + "learning_rate": 8.405305146751472e-08, + "logits/chosen": -2.599428653717041, + "logits/rejected": -2.5850729942321777, + "logps/chosen": -136.01486206054688, + "logps/rejected": -159.19105529785156, + "loss": 0.6208, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.8267250061035156, + "rewards/margins": 0.2275165319442749, + "rewards/rejected": -1.0542415380477905, + "step": 5840 + }, + { + "epoch": 1.0079255685733977, + "grad_norm": 22.872417449951172, + "learning_rate": 8.397958302021695e-08, + "logits/chosen": -2.632000684738159, + "logits/rejected": -2.614732265472412, + "logps/chosen": -128.5465087890625, + "logps/rejected": -157.53292846679688, + "loss": 0.5957, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.7695121765136719, + "rewards/margins": 0.2701513171195984, + "rewards/rejected": -1.039663553237915, + "step": 5850 + }, + { + "epoch": 1.0096485182632666, + "grad_norm": 20.183704376220703, + "learning_rate": 8.390597799526404e-08, + "logits/chosen": -2.5346813201904297, + "logits/rejected": -2.5250442028045654, + "logps/chosen": -139.59866333007812, + "logps/rejected": -159.0569610595703, + "loss": 0.6271, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.8646712303161621, + "rewards/margins": 0.20986256003379822, + "rewards/rejected": -1.0745337009429932, + "step": 5860 + }, + { + "epoch": 1.0113714679531358, + "grad_norm": 19.863487243652344, + "learning_rate": 8.383223668850433e-08, + "logits/chosen": -2.57783579826355, + "logits/rejected": -2.560011386871338, + "logps/chosen": -142.1617431640625, + "logps/rejected": -161.0272216796875, + "loss": 0.6222, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.8560919761657715, + "rewards/margins": 0.21619930863380432, + "rewards/rejected": -1.072291374206543, + "step": 5870 + }, + { + "epoch": 1.0130944176430048, + "grad_norm": 22.493450164794922, + "learning_rate": 8.375835939633384e-08, + "logits/chosen": -2.620809555053711, + "logits/rejected": -2.608182668685913, + "logps/chosen": -133.28724670410156, + "logps/rejected": -148.15118408203125, + "loss": 0.629, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.7764428853988647, + "rewards/margins": 0.18140621483325958, + "rewards/rejected": -0.9578492045402527, + "step": 5880 + }, + { + "epoch": 1.014817367332874, + "grad_norm": 33.157630920410156, + "learning_rate": 8.368434641569524e-08, + "logits/chosen": -2.6399178504943848, + "logits/rejected": -2.6295580863952637, + "logps/chosen": -138.3361053466797, + "logps/rejected": -154.23565673828125, + "loss": 0.6492, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.8195241093635559, + "rewards/margins": 0.15121665596961975, + "rewards/rejected": -0.970740795135498, + "step": 5890 + }, + { + "epoch": 1.016540317022743, + "grad_norm": 20.49550437927246, + "learning_rate": 8.361019804407657e-08, + "logits/chosen": -2.5564870834350586, + "logits/rejected": -2.540733575820923, + "logps/chosen": -143.91696166992188, + "logps/rejected": -165.53414916992188, + "loss": 0.6051, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.8727455139160156, + "rewards/margins": 0.24948236346244812, + "rewards/rejected": -1.1222279071807861, + "step": 5900 + }, + { + "epoch": 1.018263266712612, + "grad_norm": 27.403352737426758, + "learning_rate": 8.353591457951005e-08, + "logits/chosen": -2.5638561248779297, + "logits/rejected": -2.566239356994629, + "logps/chosen": -138.6494598388672, + "logps/rejected": -158.37081909179688, + "loss": 0.6432, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.8563209772109985, + "rewards/margins": 0.16037517786026, + "rewards/rejected": -1.0166962146759033, + "step": 5910 + }, + { + "epoch": 1.019986216402481, + "grad_norm": 21.737804412841797, + "learning_rate": 8.346149632057089e-08, + "logits/chosen": -2.5654804706573486, + "logits/rejected": -2.547624111175537, + "logps/chosen": -138.55712890625, + "logps/rejected": -150.98326110839844, + "loss": 0.6675, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.8675752878189087, + "rewards/margins": 0.11592147499322891, + "rewards/rejected": -0.9834968447685242, + "step": 5920 + }, + { + "epoch": 1.02170916609235, + "grad_norm": 22.046180725097656, + "learning_rate": 8.338694356637612e-08, + "logits/chosen": -2.6013948917388916, + "logits/rejected": -2.5942418575286865, + "logps/chosen": -139.40040588378906, + "logps/rejected": -154.91806030273438, + "loss": 0.6538, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.8879048228263855, + "rewards/margins": 0.15720801055431366, + "rewards/rejected": -1.0451128482818604, + "step": 5930 + }, + { + "epoch": 1.0234321157822193, + "grad_norm": 18.51112174987793, + "learning_rate": 8.331225661658331e-08, + "logits/chosen": -2.581714153289795, + "logits/rejected": -2.55427622795105, + "logps/chosen": -130.7750244140625, + "logps/rejected": -152.53250122070312, + "loss": 0.6093, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.7571039199829102, + "rewards/margins": 0.24412468075752258, + "rewards/rejected": -1.0012285709381104, + "step": 5940 + }, + { + "epoch": 1.0251550654720882, + "grad_norm": 18.122440338134766, + "learning_rate": 8.323743577138949e-08, + "logits/chosen": -2.5277466773986816, + "logits/rejected": -2.5242531299591064, + "logps/chosen": -133.4421844482422, + "logps/rejected": -144.9862518310547, + "loss": 0.6494, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.7824922204017639, + "rewards/margins": 0.1383362114429474, + "rewards/rejected": -0.9208283424377441, + "step": 5950 + }, + { + "epoch": 1.0268780151619572, + "grad_norm": 17.398357391357422, + "learning_rate": 8.316248133152979e-08, + "logits/chosen": -2.550398588180542, + "logits/rejected": -2.5142664909362793, + "logps/chosen": -136.10369873046875, + "logps/rejected": -142.1478271484375, + "loss": 0.6522, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.7740185260772705, + "rewards/margins": 0.14028708636760712, + "rewards/rejected": -0.914305567741394, + "step": 5960 + }, + { + "epoch": 1.0286009648518264, + "grad_norm": 17.406795501708984, + "learning_rate": 8.308739359827636e-08, + "logits/chosen": -2.5689642429351807, + "logits/rejected": -2.5538182258605957, + "logps/chosen": -126.88874816894531, + "logps/rejected": -144.57144165039062, + "loss": 0.6172, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.738309383392334, + "rewards/margins": 0.21361498534679413, + "rewards/rejected": -0.9519243240356445, + "step": 5970 + }, + { + "epoch": 1.0303239145416954, + "grad_norm": 18.077293395996094, + "learning_rate": 8.301217287343709e-08, + "logits/chosen": -2.5557403564453125, + "logits/rejected": -2.555007219314575, + "logps/chosen": -123.3507308959961, + "logps/rejected": -148.35134887695312, + "loss": 0.6131, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.6959012150764465, + "rewards/margins": 0.22886237502098083, + "rewards/rejected": -0.924763560295105, + "step": 5980 + }, + { + "epoch": 1.0320468642315643, + "grad_norm": 18.56474494934082, + "learning_rate": 8.293681945935445e-08, + "logits/chosen": -2.6191112995147705, + "logits/rejected": -2.592783212661743, + "logps/chosen": -124.36668395996094, + "logps/rejected": -140.59884643554688, + "loss": 0.6171, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.6880021691322327, + "rewards/margins": 0.2067534476518631, + "rewards/rejected": -0.8947556614875793, + "step": 5990 + }, + { + "epoch": 1.0337698139214335, + "grad_norm": 22.80888557434082, + "learning_rate": 8.286133365890421e-08, + "logits/chosen": -2.588536500930786, + "logits/rejected": -2.578328847885132, + "logps/chosen": -128.1990509033203, + "logps/rejected": -146.2074432373047, + "loss": 0.6293, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7584198117256165, + "rewards/margins": 0.19320937991142273, + "rewards/rejected": -0.9516291618347168, + "step": 6000 + }, + { + "epoch": 1.0337698139214335, + "eval_logits/chosen": -2.657290458679199, + "eval_logits/rejected": -2.652585506439209, + "eval_logps/chosen": -121.11921691894531, + "eval_logps/rejected": -135.34634399414062, + "eval_loss": 0.6617882251739502, + "eval_rewards/accuracies": 0.6064126491546631, + "eval_rewards/chosen": -0.6210372447967529, + "eval_rewards/margins": 0.10493012517690659, + "eval_rewards/rejected": -0.7259674072265625, + "eval_runtime": 383.1898, + "eval_samples_per_second": 11.232, + "eval_steps_per_second": 1.404, + "step": 6000 + }, + { + "epoch": 1.0354927636113025, + "grad_norm": 28.314002990722656, + "learning_rate": 8.278571577549425e-08, + "logits/chosen": -2.5861663818359375, + "logits/rejected": -2.574615478515625, + "logps/chosen": -134.91171264648438, + "logps/rejected": -141.68955993652344, + "loss": 0.6776, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.8106623888015747, + "rewards/margins": 0.10338765382766724, + "rewards/rejected": -0.9140501022338867, + "step": 6010 + }, + { + "epoch": 1.0372157133011717, + "grad_norm": 17.736967086791992, + "learning_rate": 8.270996611306335e-08, + "logits/chosen": -2.6942198276519775, + "logits/rejected": -2.6716208457946777, + "logps/chosen": -135.91213989257812, + "logps/rejected": -139.5281982421875, + "loss": 0.6722, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.8005961179733276, + "rewards/margins": 0.10625004768371582, + "rewards/rejected": -0.9068462252616882, + "step": 6020 + }, + { + "epoch": 1.0389386629910407, + "grad_norm": 16.85118865966797, + "learning_rate": 8.263408497607998e-08, + "logits/chosen": -2.481321334838867, + "logits/rejected": -2.4672279357910156, + "logps/chosen": -124.10848236083984, + "logps/rejected": -137.87637329101562, + "loss": 0.6532, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.7385944128036499, + "rewards/margins": 0.14067426323890686, + "rewards/rejected": -0.8792687654495239, + "step": 6030 + }, + { + "epoch": 1.0406616126809096, + "grad_norm": 20.27057647705078, + "learning_rate": 8.255807266954104e-08, + "logits/chosen": -2.610898733139038, + "logits/rejected": -2.5935518741607666, + "logps/chosen": -119.01715087890625, + "logps/rejected": -129.5271453857422, + "loss": 0.6462, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.6490130424499512, + "rewards/margins": 0.14013883471488953, + "rewards/rejected": -0.7891519069671631, + "step": 6040 + }, + { + "epoch": 1.0423845623707788, + "grad_norm": 19.23181915283203, + "learning_rate": 8.248192949897068e-08, + "logits/chosen": -2.5512688159942627, + "logits/rejected": -2.530177593231201, + "logps/chosen": -131.2162628173828, + "logps/rejected": -142.27493286132812, + "loss": 0.6241, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.7078680992126465, + "rewards/margins": 0.1947457492351532, + "rewards/rejected": -0.9026137590408325, + "step": 6050 + }, + { + "epoch": 1.0441075120606478, + "grad_norm": 18.816614151000977, + "learning_rate": 8.2405655770419e-08, + "logits/chosen": -2.581057071685791, + "logits/rejected": -2.5660622119903564, + "logps/chosen": -126.4916763305664, + "logps/rejected": -138.13027954101562, + "loss": 0.6497, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.724175751209259, + "rewards/margins": 0.14416101574897766, + "rewards/rejected": -0.8683366775512695, + "step": 6060 + }, + { + "epoch": 1.045830461750517, + "grad_norm": 20.96912956237793, + "learning_rate": 8.232925179046092e-08, + "logits/chosen": -2.58642578125, + "logits/rejected": -2.5690746307373047, + "logps/chosen": -124.3506851196289, + "logps/rejected": -137.90533447265625, + "loss": 0.6304, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.6937921047210693, + "rewards/margins": 0.17554596066474915, + "rewards/rejected": -0.8693380355834961, + "step": 6070 + }, + { + "epoch": 1.047553411440386, + "grad_norm": 14.86324405670166, + "learning_rate": 8.225271786619485e-08, + "logits/chosen": -2.5744924545288086, + "logits/rejected": -2.5600318908691406, + "logps/chosen": -129.59201049804688, + "logps/rejected": -138.97303771972656, + "loss": 0.6301, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.6922917366027832, + "rewards/margins": 0.17595288157463074, + "rewards/rejected": -0.8682445287704468, + "step": 6080 + }, + { + "epoch": 1.049276361130255, + "grad_norm": 21.273021697998047, + "learning_rate": 8.217605430524151e-08, + "logits/chosen": -2.6182470321655273, + "logits/rejected": -2.5997517108917236, + "logps/chosen": -120.73951721191406, + "logps/rejected": -140.35650634765625, + "loss": 0.6114, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.6846445798873901, + "rewards/margins": 0.21238479018211365, + "rewards/rejected": -0.8970292806625366, + "step": 6090 + }, + { + "epoch": 1.050999310820124, + "grad_norm": 17.90481185913086, + "learning_rate": 8.209926141574268e-08, + "logits/chosen": -2.595726728439331, + "logits/rejected": -2.5896270275115967, + "logps/chosen": -132.58290100097656, + "logps/rejected": -152.82553100585938, + "loss": 0.6048, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.7439484000205994, + "rewards/margins": 0.2570769190788269, + "rewards/rejected": -1.0010253190994263, + "step": 6100 + }, + { + "epoch": 1.052722260509993, + "grad_norm": 23.34183120727539, + "learning_rate": 8.202233950635999e-08, + "logits/chosen": -2.579566240310669, + "logits/rejected": -2.5681982040405273, + "logps/chosen": -130.9108428955078, + "logps/rejected": -156.07015991210938, + "loss": 0.6099, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.7837319374084473, + "rewards/margins": 0.24528345465660095, + "rewards/rejected": -1.029015302658081, + "step": 6110 + }, + { + "epoch": 1.0544452101998623, + "grad_norm": 22.64227294921875, + "learning_rate": 8.194528888627361e-08, + "logits/chosen": -2.6670823097229004, + "logits/rejected": -2.6301536560058594, + "logps/chosen": -136.83065795898438, + "logps/rejected": -163.07357788085938, + "loss": 0.5872, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.8139989972114563, + "rewards/margins": 0.2943005859851837, + "rewards/rejected": -1.1082994937896729, + "step": 6120 + }, + { + "epoch": 1.0561681598897312, + "grad_norm": 20.53219985961914, + "learning_rate": 8.186810986518112e-08, + "logits/chosen": -2.5834572315216064, + "logits/rejected": -2.566066265106201, + "logps/chosen": -145.5009307861328, + "logps/rejected": -160.21743774414062, + "loss": 0.6424, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.9014961123466492, + "rewards/margins": 0.16748717427253723, + "rewards/rejected": -1.0689833164215088, + "step": 6130 + }, + { + "epoch": 1.0578911095796002, + "grad_norm": 30.76763153076172, + "learning_rate": 8.179080275329606e-08, + "logits/chosen": -2.594026803970337, + "logits/rejected": -2.583535671234131, + "logps/chosen": -141.31610107421875, + "logps/rejected": -161.57565307617188, + "loss": 0.626, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.8761361837387085, + "rewards/margins": 0.223777174949646, + "rewards/rejected": -1.0999133586883545, + "step": 6140 + }, + { + "epoch": 1.0596140592694694, + "grad_norm": 20.423755645751953, + "learning_rate": 8.171336786134699e-08, + "logits/chosen": -2.5477170944213867, + "logits/rejected": -2.5375678539276123, + "logps/chosen": -145.0775146484375, + "logps/rejected": -158.58587646484375, + "loss": 0.6448, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.9156156778335571, + "rewards/margins": 0.18095484375953674, + "rewards/rejected": -1.096570611000061, + "step": 6150 + }, + { + "epoch": 1.0613370089593384, + "grad_norm": 24.544588088989258, + "learning_rate": 8.163580550057596e-08, + "logits/chosen": -2.5236897468566895, + "logits/rejected": -2.5145726203918457, + "logps/chosen": -146.05165100097656, + "logps/rejected": -162.24951171875, + "loss": 0.6453, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.9362128973007202, + "rewards/margins": 0.16781552135944366, + "rewards/rejected": -1.104028582572937, + "step": 6160 + }, + { + "epoch": 1.0630599586492075, + "grad_norm": 25.148283004760742, + "learning_rate": 8.155811598273737e-08, + "logits/chosen": -2.6110711097717285, + "logits/rejected": -2.5971312522888184, + "logps/chosen": -160.1621551513672, + "logps/rejected": -181.1752166748047, + "loss": 0.6112, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.0028265714645386, + "rewards/margins": 0.24775870144367218, + "rewards/rejected": -1.2505853176116943, + "step": 6170 + }, + { + "epoch": 1.0647829083390765, + "grad_norm": 25.57758903503418, + "learning_rate": 8.148029962009677e-08, + "logits/chosen": -2.6012930870056152, + "logits/rejected": -2.585240125656128, + "logps/chosen": -161.4896697998047, + "logps/rejected": -173.24032592773438, + "loss": 0.6366, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.0221115350723267, + "rewards/margins": 0.1880497932434082, + "rewards/rejected": -1.2101614475250244, + "step": 6180 + }, + { + "epoch": 1.0665058580289455, + "grad_norm": 26.79003143310547, + "learning_rate": 8.140235672542951e-08, + "logits/chosen": -2.5914082527160645, + "logits/rejected": -2.5720105171203613, + "logps/chosen": -165.79928588867188, + "logps/rejected": -176.74075317382812, + "loss": 0.6417, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.0590981245040894, + "rewards/margins": 0.18660911917686462, + "rewards/rejected": -1.2457071542739868, + "step": 6190 + }, + { + "epoch": 1.0682288077188147, + "grad_norm": 33.11368942260742, + "learning_rate": 8.132428761201953e-08, + "logits/chosen": -2.5066819190979004, + "logits/rejected": -2.489210605621338, + "logps/chosen": -157.1233673095703, + "logps/rejected": -180.30471801757812, + "loss": 0.618, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.0171107053756714, + "rewards/margins": 0.24292024970054626, + "rewards/rejected": -1.26003098487854, + "step": 6200 + }, + { + "epoch": 1.0699517574086836, + "grad_norm": 27.38452911376953, + "learning_rate": 8.124609259365812e-08, + "logits/chosen": -2.5813941955566406, + "logits/rejected": -2.5623717308044434, + "logps/chosen": -157.97499084472656, + "logps/rejected": -175.90878295898438, + "loss": 0.615, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.016340970993042, + "rewards/margins": 0.21486850082874298, + "rewards/rejected": -1.231209397315979, + "step": 6210 + }, + { + "epoch": 1.0716747070985528, + "grad_norm": 28.536115646362305, + "learning_rate": 8.116777198464257e-08, + "logits/chosen": -2.5717034339904785, + "logits/rejected": -2.557497024536133, + "logps/chosen": -155.28977966308594, + "logps/rejected": -172.4293975830078, + "loss": 0.6368, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.0214192867279053, + "rewards/margins": 0.1867234855890274, + "rewards/rejected": -1.208142876625061, + "step": 6220 + }, + { + "epoch": 1.0733976567884218, + "grad_norm": 20.48895263671875, + "learning_rate": 8.108932609977504e-08, + "logits/chosen": -2.661949872970581, + "logits/rejected": -2.6440796852111816, + "logps/chosen": -153.58737182617188, + "logps/rejected": -176.33309936523438, + "loss": 0.6006, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.9294829368591309, + "rewards/margins": 0.27870726585388184, + "rewards/rejected": -1.2081902027130127, + "step": 6230 + }, + { + "epoch": 1.0751206064782908, + "grad_norm": 25.738983154296875, + "learning_rate": 8.101075525436121e-08, + "logits/chosen": -2.5315258502960205, + "logits/rejected": -2.5130081176757812, + "logps/chosen": -155.04676818847656, + "logps/rejected": -167.66976928710938, + "loss": 0.6498, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.9809181094169617, + "rewards/margins": 0.1655711978673935, + "rewards/rejected": -1.1464893817901611, + "step": 6240 + }, + { + "epoch": 1.07684355616816, + "grad_norm": 51.1860237121582, + "learning_rate": 8.093205976420896e-08, + "logits/chosen": -2.6003103256225586, + "logits/rejected": -2.580641269683838, + "logps/chosen": -155.04019165039062, + "logps/rejected": -162.6709747314453, + "loss": 0.6694, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.9813059568405151, + "rewards/margins": 0.11345939338207245, + "rewards/rejected": -1.0947654247283936, + "step": 6250 + }, + { + "epoch": 1.078566505858029, + "grad_norm": 19.37555503845215, + "learning_rate": 8.085323994562727e-08, + "logits/chosen": -2.5086302757263184, + "logits/rejected": -2.489610195159912, + "logps/chosen": -148.66253662109375, + "logps/rejected": -165.6138916015625, + "loss": 0.6275, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.9338725805282593, + "rewards/margins": 0.216073676943779, + "rewards/rejected": -1.1499463319778442, + "step": 6260 + }, + { + "epoch": 1.080289455547898, + "grad_norm": 15.658135414123535, + "learning_rate": 8.077429611542476e-08, + "logits/chosen": -2.7011168003082275, + "logits/rejected": -2.7036242485046387, + "logps/chosen": -140.63345336914062, + "logps/rejected": -156.34402465820312, + "loss": 0.6509, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.8695812225341797, + "rewards/margins": 0.15376465022563934, + "rewards/rejected": -1.023345947265625, + "step": 6270 + }, + { + "epoch": 1.082012405237767, + "grad_norm": 23.98619270324707, + "learning_rate": 8.069522859090856e-08, + "logits/chosen": -2.4844882488250732, + "logits/rejected": -2.4635424613952637, + "logps/chosen": -136.20181274414062, + "logps/rejected": -150.969482421875, + "loss": 0.6376, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.8131068348884583, + "rewards/margins": 0.1791612058877945, + "rewards/rejected": -0.9922679662704468, + "step": 6280 + }, + { + "epoch": 1.083735354927636, + "grad_norm": 19.53284454345703, + "learning_rate": 8.061603768988294e-08, + "logits/chosen": -2.5444564819335938, + "logits/rejected": -2.523756742477417, + "logps/chosen": -130.36697387695312, + "logps/rejected": -151.30662536621094, + "loss": 0.6153, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.781546950340271, + "rewards/margins": 0.21957603096961975, + "rewards/rejected": -1.001123070716858, + "step": 6290 + }, + { + "epoch": 1.0854583046175053, + "grad_norm": 21.767004013061523, + "learning_rate": 8.053672373064811e-08, + "logits/chosen": -2.5796427726745605, + "logits/rejected": -2.563560962677002, + "logps/chosen": -141.37403869628906, + "logps/rejected": -155.7559356689453, + "loss": 0.6416, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.8766446113586426, + "rewards/margins": 0.17183735966682434, + "rewards/rejected": -1.0484821796417236, + "step": 6300 + }, + { + "epoch": 1.0871812543073742, + "grad_norm": 26.944143295288086, + "learning_rate": 8.045728703199885e-08, + "logits/chosen": -2.5742251873016357, + "logits/rejected": -2.551163673400879, + "logps/chosen": -132.3316650390625, + "logps/rejected": -145.6800994873047, + "loss": 0.6559, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.8050101399421692, + "rewards/margins": 0.14063867926597595, + "rewards/rejected": -0.9456488490104675, + "step": 6310 + }, + { + "epoch": 1.0889042039972432, + "grad_norm": 20.394813537597656, + "learning_rate": 8.037772791322331e-08, + "logits/chosen": -2.528676748275757, + "logits/rejected": -2.513557195663452, + "logps/chosen": -134.52194213867188, + "logps/rejected": -151.3898468017578, + "loss": 0.63, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.7903624773025513, + "rewards/margins": 0.19733569025993347, + "rewards/rejected": -0.9876980781555176, + "step": 6320 + }, + { + "epoch": 1.0906271536871124, + "grad_norm": 20.387157440185547, + "learning_rate": 8.029804669410171e-08, + "logits/chosen": -2.5528550148010254, + "logits/rejected": -2.5368423461914062, + "logps/chosen": -132.63211059570312, + "logps/rejected": -158.43017578125, + "loss": 0.5921, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.777782678604126, + "rewards/margins": 0.2880386710166931, + "rewards/rejected": -1.0658212900161743, + "step": 6330 + }, + { + "epoch": 1.0923501033769814, + "grad_norm": 17.525043487548828, + "learning_rate": 8.0218243694905e-08, + "logits/chosen": -2.5942163467407227, + "logits/rejected": -2.576354503631592, + "logps/chosen": -134.7543182373047, + "logps/rejected": -147.21018981933594, + "loss": 0.6359, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.7760751843452454, + "rewards/margins": 0.17111438512802124, + "rewards/rejected": -0.9471896886825562, + "step": 6340 + }, + { + "epoch": 1.0940730530668505, + "grad_norm": 30.56574058532715, + "learning_rate": 8.013831923639363e-08, + "logits/chosen": -2.551419734954834, + "logits/rejected": -2.545924663543701, + "logps/chosen": -134.21063232421875, + "logps/rejected": -154.470703125, + "loss": 0.6198, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.7749854922294617, + "rewards/margins": 0.21509429812431335, + "rewards/rejected": -0.9900798797607422, + "step": 6350 + }, + { + "epoch": 1.0957960027567195, + "grad_norm": 23.940534591674805, + "learning_rate": 8.005827363981626e-08, + "logits/chosen": -2.536100149154663, + "logits/rejected": -2.53417706489563, + "logps/chosen": -134.65310668945312, + "logps/rejected": -154.59707641601562, + "loss": 0.6298, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.7997391819953918, + "rewards/margins": 0.2075682133436203, + "rewards/rejected": -1.0073074102401733, + "step": 6360 + }, + { + "epoch": 1.0975189524465885, + "grad_norm": 29.463321685791016, + "learning_rate": 7.997810722690845e-08, + "logits/chosen": -2.559302806854248, + "logits/rejected": -2.555941104888916, + "logps/chosen": -141.52078247070312, + "logps/rejected": -153.58714294433594, + "loss": 0.6646, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.8632817268371582, + "rewards/margins": 0.13526658713817596, + "rewards/rejected": -0.9985483288764954, + "step": 6370 + }, + { + "epoch": 1.0992419021364577, + "grad_norm": 33.703369140625, + "learning_rate": 7.989782031989135e-08, + "logits/chosen": -2.589667320251465, + "logits/rejected": -2.5763516426086426, + "logps/chosen": -147.19015502929688, + "logps/rejected": -170.93853759765625, + "loss": 0.6199, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9342149496078491, + "rewards/margins": 0.21943017840385437, + "rewards/rejected": -1.1536452770233154, + "step": 6380 + }, + { + "epoch": 1.1009648518263266, + "grad_norm": 23.249961853027344, + "learning_rate": 7.981741324147043e-08, + "logits/chosen": -2.6110785007476807, + "logits/rejected": -2.5831587314605713, + "logps/chosen": -142.32308959960938, + "logps/rejected": -159.36021423339844, + "loss": 0.6117, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.84880131483078, + "rewards/margins": 0.2458713948726654, + "rewards/rejected": -1.0946727991104126, + "step": 6390 + }, + { + "epoch": 1.1026878015161956, + "grad_norm": 23.310190200805664, + "learning_rate": 7.973688631483421e-08, + "logits/chosen": -2.5864102840423584, + "logits/rejected": -2.566074848175049, + "logps/chosen": -140.55307006835938, + "logps/rejected": -158.64735412597656, + "loss": 0.6247, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.8174555897712708, + "rewards/margins": 0.21133661270141602, + "rewards/rejected": -1.028792142868042, + "step": 6400 + }, + { + "epoch": 1.1026878015161956, + "eval_logits/chosen": -2.625418186187744, + "eval_logits/rejected": -2.6201014518737793, + "eval_logps/chosen": -129.89840698242188, + "eval_logps/rejected": -145.43096923828125, + "eval_loss": 0.6587028503417969, + "eval_rewards/accuracies": 0.5989776849746704, + "eval_rewards/chosen": -0.7088292837142944, + "eval_rewards/margins": 0.1179843619465828, + "eval_rewards/rejected": -0.8268135786056519, + "eval_runtime": 383.4934, + "eval_samples_per_second": 11.223, + "eval_steps_per_second": 1.403, + "step": 6400 + }, + { + "epoch": 1.1044107512060648, + "grad_norm": 30.746761322021484, + "learning_rate": 7.965623986365286e-08, + "logits/chosen": -2.632479190826416, + "logits/rejected": -2.614830493927002, + "logps/chosen": -147.27743530273438, + "logps/rejected": -161.5043487548828, + "loss": 0.6384, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.9128227233886719, + "rewards/margins": 0.19528654217720032, + "rewards/rejected": -1.1081092357635498, + "step": 6410 + }, + { + "epoch": 1.1061337008959338, + "grad_norm": 24.96663475036621, + "learning_rate": 7.957547421207705e-08, + "logits/chosen": -2.5945236682891846, + "logits/rejected": -2.5781524181365967, + "logps/chosen": -148.02255249023438, + "logps/rejected": -160.6229248046875, + "loss": 0.655, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.9230918884277344, + "rewards/margins": 0.15126900374889374, + "rewards/rejected": -1.0743608474731445, + "step": 6420 + }, + { + "epoch": 1.107856650585803, + "grad_norm": 18.77534294128418, + "learning_rate": 7.949458968473649e-08, + "logits/chosen": -2.5306360721588135, + "logits/rejected": -2.5238285064697266, + "logps/chosen": -137.1156463623047, + "logps/rejected": -143.7222442626953, + "loss": 0.6782, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.86149662733078, + "rewards/margins": 0.09484690427780151, + "rewards/rejected": -0.9563434720039368, + "step": 6430 + }, + { + "epoch": 1.109579600275672, + "grad_norm": 19.45603370666504, + "learning_rate": 7.941358660673876e-08, + "logits/chosen": -2.561483860015869, + "logits/rejected": -2.5457534790039062, + "logps/chosen": -141.83096313476562, + "logps/rejected": -156.63902282714844, + "loss": 0.644, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.8634850382804871, + "rewards/margins": 0.1716698855161667, + "rewards/rejected": -1.035154938697815, + "step": 6440 + }, + { + "epoch": 1.111302549965541, + "grad_norm": 19.846593856811523, + "learning_rate": 7.933246530366788e-08, + "logits/chosen": -2.5645689964294434, + "logits/rejected": -2.540552854537964, + "logps/chosen": -139.06712341308594, + "logps/rejected": -154.57325744628906, + "loss": 0.6241, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.8170877695083618, + "rewards/margins": 0.2099314033985138, + "rewards/rejected": -1.0270192623138428, + "step": 6450 + }, + { + "epoch": 1.11302549965541, + "grad_norm": 19.40055274963379, + "learning_rate": 7.925122610158315e-08, + "logits/chosen": -2.513282537460327, + "logits/rejected": -2.5152535438537598, + "logps/chosen": -131.2607421875, + "logps/rejected": -158.49551391601562, + "loss": 0.6154, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.7849147915840149, + "rewards/margins": 0.2336963713169098, + "rewards/rejected": -1.018611192703247, + "step": 6460 + }, + { + "epoch": 1.114748449345279, + "grad_norm": 22.3400821685791, + "learning_rate": 7.916986932701766e-08, + "logits/chosen": -2.485816717147827, + "logits/rejected": -2.4699718952178955, + "logps/chosen": -136.15518188476562, + "logps/rejected": -151.83595275878906, + "loss": 0.6355, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.8198622465133667, + "rewards/margins": 0.18899795413017273, + "rewards/rejected": -1.0088602304458618, + "step": 6470 + }, + { + "epoch": 1.1164713990351482, + "grad_norm": 24.075334548950195, + "learning_rate": 7.908839530697713e-08, + "logits/chosen": -2.558938503265381, + "logits/rejected": -2.5304675102233887, + "logps/chosen": -135.81005859375, + "logps/rejected": -146.69322204589844, + "loss": 0.6372, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.7854007482528687, + "rewards/margins": 0.17798066139221191, + "rewards/rejected": -0.9633814096450806, + "step": 6480 + }, + { + "epoch": 1.1181943487250172, + "grad_norm": 20.839139938354492, + "learning_rate": 7.900680436893852e-08, + "logits/chosen": -2.6834394931793213, + "logits/rejected": -2.674525022506714, + "logps/chosen": -141.89013671875, + "logps/rejected": -154.45413208007812, + "loss": 0.6588, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.8717700839042664, + "rewards/margins": 0.13743311166763306, + "rewards/rejected": -1.009203314781189, + "step": 6490 + }, + { + "epoch": 1.1199172984148862, + "grad_norm": 17.810470581054688, + "learning_rate": 7.892509684084874e-08, + "logits/chosen": -2.5747642517089844, + "logits/rejected": -2.5670723915100098, + "logps/chosen": -143.84841918945312, + "logps/rejected": -151.48602294921875, + "loss": 0.6572, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.864587128162384, + "rewards/margins": 0.1298464834690094, + "rewards/rejected": -0.9944335222244263, + "step": 6500 + }, + { + "epoch": 1.1216402481047554, + "grad_norm": 26.317533493041992, + "learning_rate": 7.884327305112332e-08, + "logits/chosen": -2.566572904586792, + "logits/rejected": -2.5307557582855225, + "logps/chosen": -137.45692443847656, + "logps/rejected": -152.34506225585938, + "loss": 0.6282, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.8011199831962585, + "rewards/margins": 0.2003363072872162, + "rewards/rejected": -1.0014562606811523, + "step": 6510 + }, + { + "epoch": 1.1233631977946243, + "grad_norm": 22.78864288330078, + "learning_rate": 7.876133332864505e-08, + "logits/chosen": -2.5616345405578613, + "logits/rejected": -2.543246030807495, + "logps/chosen": -129.95115661621094, + "logps/rejected": -144.4560546875, + "loss": 0.6324, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.7529776096343994, + "rewards/margins": 0.18292686343193054, + "rewards/rejected": -0.9359043836593628, + "step": 6520 + }, + { + "epoch": 1.1250861474844935, + "grad_norm": 16.22922706604004, + "learning_rate": 7.86792780027628e-08, + "logits/chosen": -2.528315544128418, + "logits/rejected": -2.506397008895874, + "logps/chosen": -135.58074951171875, + "logps/rejected": -153.8533935546875, + "loss": 0.6157, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.8108615875244141, + "rewards/margins": 0.2233131229877472, + "rewards/rejected": -1.0341746807098389, + "step": 6530 + }, + { + "epoch": 1.1268090971743625, + "grad_norm": 25.679231643676758, + "learning_rate": 7.859710740328998e-08, + "logits/chosen": -2.5504448413848877, + "logits/rejected": -2.5254950523376465, + "logps/chosen": -146.25262451171875, + "logps/rejected": -159.8805389404297, + "loss": 0.6526, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -0.8821076154708862, + "rewards/margins": 0.1563064604997635, + "rewards/rejected": -1.0384140014648438, + "step": 6540 + }, + { + "epoch": 1.1285320468642315, + "grad_norm": 31.117042541503906, + "learning_rate": 7.85148218605034e-08, + "logits/chosen": -2.5004849433898926, + "logits/rejected": -2.4817347526550293, + "logps/chosen": -138.51187133789062, + "logps/rejected": -155.68972778320312, + "loss": 0.6447, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.8690078854560852, + "rewards/margins": 0.17679616808891296, + "rewards/rejected": -1.0458040237426758, + "step": 6550 + }, + { + "epoch": 1.1302549965541007, + "grad_norm": 27.82855796813965, + "learning_rate": 7.843242170514187e-08, + "logits/chosen": -2.5705184936523438, + "logits/rejected": -2.540701389312744, + "logps/chosen": -141.11805725097656, + "logps/rejected": -158.58779907226562, + "loss": 0.6132, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.8434591293334961, + "rewards/margins": 0.23432815074920654, + "rewards/rejected": -1.077787160873413, + "step": 6560 + }, + { + "epoch": 1.1319779462439696, + "grad_norm": 23.152801513671875, + "learning_rate": 7.834990726840485e-08, + "logits/chosen": -2.5601956844329834, + "logits/rejected": -2.5374298095703125, + "logps/chosen": -139.59695434570312, + "logps/rejected": -154.17977905273438, + "loss": 0.6302, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.8412817716598511, + "rewards/margins": 0.18191157281398773, + "rewards/rejected": -1.023193359375, + "step": 6570 + }, + { + "epoch": 1.1337008959338388, + "grad_norm": 20.162002563476562, + "learning_rate": 7.826727888195118e-08, + "logits/chosen": -2.5681393146514893, + "logits/rejected": -2.5380806922912598, + "logps/chosen": -143.95455932617188, + "logps/rejected": -153.251953125, + "loss": 0.6509, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.8760701417922974, + "rewards/margins": 0.1618967354297638, + "rewards/rejected": -1.0379668474197388, + "step": 6580 + }, + { + "epoch": 1.1354238456237078, + "grad_norm": 21.368446350097656, + "learning_rate": 7.818453687789766e-08, + "logits/chosen": -2.5328991413116455, + "logits/rejected": -2.5141239166259766, + "logps/chosen": -137.56971740722656, + "logps/rejected": -156.55111694335938, + "loss": 0.6281, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8404220342636108, + "rewards/margins": 0.19537512958049774, + "rewards/rejected": -1.035797357559204, + "step": 6590 + }, + { + "epoch": 1.1371467953135768, + "grad_norm": 26.21380043029785, + "learning_rate": 7.81016815888178e-08, + "logits/chosen": -2.591376543045044, + "logits/rejected": -2.5790627002716064, + "logps/chosen": -141.25106811523438, + "logps/rejected": -156.2598419189453, + "loss": 0.6354, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.8445740938186646, + "rewards/margins": 0.18649375438690186, + "rewards/rejected": -1.031067967414856, + "step": 6600 + }, + { + "epoch": 1.138869745003446, + "grad_norm": 21.277738571166992, + "learning_rate": 7.801871334774045e-08, + "logits/chosen": -2.5717265605926514, + "logits/rejected": -2.560377597808838, + "logps/chosen": -136.41635131835938, + "logps/rejected": -154.43130493164062, + "loss": 0.6244, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.8334992527961731, + "rewards/margins": 0.20126286149024963, + "rewards/rejected": -1.0347621440887451, + "step": 6610 + }, + { + "epoch": 1.140592694693315, + "grad_norm": 23.049354553222656, + "learning_rate": 7.793563248814843e-08, + "logits/chosen": -2.515204906463623, + "logits/rejected": -2.496919870376587, + "logps/chosen": -144.44114685058594, + "logps/rejected": -157.16624450683594, + "loss": 0.6591, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.9124046564102173, + "rewards/margins": 0.146372988820076, + "rewards/rejected": -1.0587775707244873, + "step": 6620 + }, + { + "epoch": 1.1423156443831841, + "grad_norm": 21.18852996826172, + "learning_rate": 7.785243934397725e-08, + "logits/chosen": -2.518746852874756, + "logits/rejected": -2.5005240440368652, + "logps/chosen": -132.6031036376953, + "logps/rejected": -140.0795135498047, + "loss": 0.6539, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.7796574831008911, + "rewards/margins": 0.13536986708641052, + "rewards/rejected": -0.9150273203849792, + "step": 6630 + }, + { + "epoch": 1.144038594073053, + "grad_norm": 28.055564880371094, + "learning_rate": 7.776913424961374e-08, + "logits/chosen": -2.5686328411102295, + "logits/rejected": -2.54179310798645, + "logps/chosen": -135.70887756347656, + "logps/rejected": -143.9758758544922, + "loss": 0.6526, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.8114780187606812, + "rewards/margins": 0.14228160679340363, + "rewards/rejected": -0.9537595510482788, + "step": 6640 + }, + { + "epoch": 1.145761543762922, + "grad_norm": 31.527973175048828, + "learning_rate": 7.768571753989465e-08, + "logits/chosen": -2.5954716205596924, + "logits/rejected": -2.5796637535095215, + "logps/chosen": -131.65628051757812, + "logps/rejected": -151.76107788085938, + "loss": 0.6297, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.785808801651001, + "rewards/margins": 0.20410504937171936, + "rewards/rejected": -0.989913821220398, + "step": 6650 + }, + { + "epoch": 1.1474844934527912, + "grad_norm": 24.025869369506836, + "learning_rate": 7.760218955010542e-08, + "logits/chosen": -2.622337818145752, + "logits/rejected": -2.613774061203003, + "logps/chosen": -128.61228942871094, + "logps/rejected": -146.10922241210938, + "loss": 0.6369, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.757174015045166, + "rewards/margins": 0.17353567481040955, + "rewards/rejected": -0.9307096600532532, + "step": 6660 + }, + { + "epoch": 1.1492074431426602, + "grad_norm": 21.362499237060547, + "learning_rate": 7.751855061597875e-08, + "logits/chosen": -2.522671937942505, + "logits/rejected": -2.524449348449707, + "logps/chosen": -130.78817749023438, + "logps/rejected": -161.01031494140625, + "loss": 0.6017, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.800433337688446, + "rewards/margins": 0.2665384113788605, + "rewards/rejected": -1.066971778869629, + "step": 6670 + }, + { + "epoch": 1.1509303928325294, + "grad_norm": 25.16261863708496, + "learning_rate": 7.743480107369324e-08, + "logits/chosen": -2.521477222442627, + "logits/rejected": -2.5011825561523438, + "logps/chosen": -134.3588104248047, + "logps/rejected": -146.83877563476562, + "loss": 0.6455, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.8043110966682434, + "rewards/margins": 0.15868356823921204, + "rewards/rejected": -0.9629947543144226, + "step": 6680 + }, + { + "epoch": 1.1526533425223984, + "grad_norm": 20.07200050354004, + "learning_rate": 7.735094125987214e-08, + "logits/chosen": -2.5675048828125, + "logits/rejected": -2.5460128784179688, + "logps/chosen": -132.4228057861328, + "logps/rejected": -152.199951171875, + "loss": 0.6141, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.743118405342102, + "rewards/margins": 0.2335709035396576, + "rewards/rejected": -0.9766892194747925, + "step": 6690 + }, + { + "epoch": 1.1543762922122673, + "grad_norm": 29.890438079833984, + "learning_rate": 7.726697151158183e-08, + "logits/chosen": -2.5477867126464844, + "logits/rejected": -2.539823055267334, + "logps/chosen": -134.93380737304688, + "logps/rejected": -154.91146850585938, + "loss": 0.6193, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.7691043615341187, + "rewards/margins": 0.22836211323738098, + "rewards/rejected": -0.9974665641784668, + "step": 6700 + }, + { + "epoch": 1.1560992419021365, + "grad_norm": 19.754623413085938, + "learning_rate": 7.718289216633063e-08, + "logits/chosen": -2.5491116046905518, + "logits/rejected": -2.5243980884552, + "logps/chosen": -135.69090270996094, + "logps/rejected": -153.7061309814453, + "loss": 0.6247, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7767504453659058, + "rewards/margins": 0.23671141266822815, + "rewards/rejected": -1.0134618282318115, + "step": 6710 + }, + { + "epoch": 1.1578221915920055, + "grad_norm": 20.112178802490234, + "learning_rate": 7.709870356206736e-08, + "logits/chosen": -2.537121534347534, + "logits/rejected": -2.520521879196167, + "logps/chosen": -129.10336303710938, + "logps/rejected": -148.66500854492188, + "loss": 0.6274, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.7644118070602417, + "rewards/margins": 0.2091284692287445, + "rewards/rejected": -0.9735404253005981, + "step": 6720 + }, + { + "epoch": 1.1595451412818747, + "grad_norm": 20.700651168823242, + "learning_rate": 7.701440603718e-08, + "logits/chosen": -2.510880947113037, + "logits/rejected": -2.4959218502044678, + "logps/chosen": -140.76002502441406, + "logps/rejected": -150.38844299316406, + "loss": 0.6665, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.8319160342216492, + "rewards/margins": 0.1302139014005661, + "rewards/rejected": -0.9621298909187317, + "step": 6730 + }, + { + "epoch": 1.1612680909717437, + "grad_norm": 25.546266555786133, + "learning_rate": 7.692999993049429e-08, + "logits/chosen": -2.5427424907684326, + "logits/rejected": -2.535956621170044, + "logps/chosen": -135.4132537841797, + "logps/rejected": -155.85977172851562, + "loss": 0.6182, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.800602912902832, + "rewards/margins": 0.21048641204833984, + "rewards/rejected": -1.0110893249511719, + "step": 6740 + }, + { + "epoch": 1.1629910406616126, + "grad_norm": 29.764677047729492, + "learning_rate": 7.684548558127247e-08, + "logits/chosen": -2.5787529945373535, + "logits/rejected": -2.5668704509735107, + "logps/chosen": -140.60609436035156, + "logps/rejected": -163.8345947265625, + "loss": 0.618, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.871651828289032, + "rewards/margins": 0.2363211214542389, + "rewards/rejected": -1.1079729795455933, + "step": 6750 + }, + { + "epoch": 1.1647139903514818, + "grad_norm": 25.136219024658203, + "learning_rate": 7.676086332921176e-08, + "logits/chosen": -2.5359222888946533, + "logits/rejected": -2.515162944793701, + "logps/chosen": -136.893310546875, + "logps/rejected": -152.5428009033203, + "loss": 0.6302, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.8350076675415039, + "rewards/margins": 0.20780706405639648, + "rewards/rejected": -1.0428146123886108, + "step": 6760 + }, + { + "epoch": 1.1664369400413508, + "grad_norm": 29.839658737182617, + "learning_rate": 7.667613351444318e-08, + "logits/chosen": -2.556365489959717, + "logits/rejected": -2.548959732055664, + "logps/chosen": -141.61341857910156, + "logps/rejected": -161.94760131835938, + "loss": 0.6314, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.8632542490959167, + "rewards/margins": 0.20508262515068054, + "rewards/rejected": -1.068337082862854, + "step": 6770 + }, + { + "epoch": 1.1681598897312198, + "grad_norm": 29.74138069152832, + "learning_rate": 7.659129647753002e-08, + "logits/chosen": -2.5341110229492188, + "logits/rejected": -2.511051654815674, + "logps/chosen": -147.91268920898438, + "logps/rejected": -160.97518920898438, + "loss": 0.6452, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9066078066825867, + "rewards/margins": 0.17172715067863464, + "rewards/rejected": -1.078334927558899, + "step": 6780 + }, + { + "epoch": 1.169882839421089, + "grad_norm": 26.996421813964844, + "learning_rate": 7.650635255946658e-08, + "logits/chosen": -2.5564591884613037, + "logits/rejected": -2.541588306427002, + "logps/chosen": -138.55068969726562, + "logps/rejected": -167.80386352539062, + "loss": 0.5865, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.8645964860916138, + "rewards/margins": 0.29569393396377563, + "rewards/rejected": -1.1602903604507446, + "step": 6790 + }, + { + "epoch": 1.171605789110958, + "grad_norm": 20.826135635375977, + "learning_rate": 7.642130210167673e-08, + "logits/chosen": -2.4886882305145264, + "logits/rejected": -2.4650111198425293, + "logps/chosen": -145.65353393554688, + "logps/rejected": -165.99551391601562, + "loss": 0.6194, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.9237545132637024, + "rewards/margins": 0.2404859960079193, + "rewards/rejected": -1.1642405986785889, + "step": 6800 + }, + { + "epoch": 1.171605789110958, + "eval_logits/chosen": -2.591191291809082, + "eval_logits/rejected": -2.585751533508301, + "eval_logps/chosen": -138.56918334960938, + "eval_logps/rejected": -154.65988159179688, + "eval_loss": 0.6580451130867004, + "eval_rewards/accuracies": 0.5980483293533325, + "eval_rewards/chosen": -0.7955370545387268, + "eval_rewards/margins": 0.1235656887292862, + "eval_rewards/rejected": -0.9191027879714966, + "eval_runtime": 382.8732, + "eval_samples_per_second": 11.241, + "eval_steps_per_second": 1.405, + "step": 6800 + }, + { + "epoch": 1.173328738800827, + "grad_norm": 23.784109115600586, + "learning_rate": 7.633614544601257e-08, + "logits/chosen": -2.5083320140838623, + "logits/rejected": -2.4918289184570312, + "logps/chosen": -154.79818725585938, + "logps/rejected": -169.489501953125, + "loss": 0.6279, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9800655245780945, + "rewards/margins": 0.20611277222633362, + "rewards/rejected": -1.18617844581604, + "step": 6810 + }, + { + "epoch": 1.175051688490696, + "grad_norm": 22.796245574951172, + "learning_rate": 7.625088293475308e-08, + "logits/chosen": -2.5985751152038574, + "logits/rejected": -2.5754246711730957, + "logps/chosen": -153.35060119628906, + "logps/rejected": -170.70590209960938, + "loss": 0.626, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.9690364599227905, + "rewards/margins": 0.2199009358882904, + "rewards/rejected": -1.1889374256134033, + "step": 6820 + }, + { + "epoch": 1.176774638180565, + "grad_norm": 17.840660095214844, + "learning_rate": 7.61655149106027e-08, + "logits/chosen": -2.5838685035705566, + "logits/rejected": -2.5834288597106934, + "logps/chosen": -149.63232421875, + "logps/rejected": -167.7073211669922, + "loss": 0.6402, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.9502065777778625, + "rewards/margins": 0.20394647121429443, + "rewards/rejected": -1.1541529893875122, + "step": 6830 + }, + { + "epoch": 1.1784975878704342, + "grad_norm": 21.295618057250977, + "learning_rate": 7.608004171668994e-08, + "logits/chosen": -2.567112684249878, + "logits/rejected": -2.5470948219299316, + "logps/chosen": -154.1027374267578, + "logps/rejected": -169.5994415283203, + "loss": 0.6509, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.9616118669509888, + "rewards/margins": 0.18688850104808807, + "rewards/rejected": -1.1485002040863037, + "step": 6840 + }, + { + "epoch": 1.1802205375603032, + "grad_norm": 27.233596801757812, + "learning_rate": 7.599446369656608e-08, + "logits/chosen": -2.4759891033172607, + "logits/rejected": -2.4508557319641113, + "logps/chosen": -147.71963500976562, + "logps/rejected": -165.37037658691406, + "loss": 0.6399, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.923791229724884, + "rewards/margins": 0.20322775840759277, + "rewards/rejected": -1.127018928527832, + "step": 6850 + }, + { + "epoch": 1.1819434872501722, + "grad_norm": 21.57741928100586, + "learning_rate": 7.59087811942037e-08, + "logits/chosen": -2.5545706748962402, + "logits/rejected": -2.5255355834960938, + "logps/chosen": -153.35557556152344, + "logps/rejected": -164.24728393554688, + "loss": 0.627, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.9340866804122925, + "rewards/margins": 0.19009587168693542, + "rewards/rejected": -1.1241824626922607, + "step": 6860 + }, + { + "epoch": 1.1836664369400414, + "grad_norm": 25.994808197021484, + "learning_rate": 7.582299455399536e-08, + "logits/chosen": -2.476388454437256, + "logits/rejected": -2.4694392681121826, + "logps/chosen": -140.9687957763672, + "logps/rejected": -158.087158203125, + "loss": 0.6374, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.8894684910774231, + "rewards/margins": 0.17998628318309784, + "rewards/rejected": -1.0694547891616821, + "step": 6870 + }, + { + "epoch": 1.1853893866299103, + "grad_norm": 21.931888580322266, + "learning_rate": 7.573710412075218e-08, + "logits/chosen": -2.561197280883789, + "logits/rejected": -2.5365850925445557, + "logps/chosen": -141.51438903808594, + "logps/rejected": -157.063720703125, + "loss": 0.6289, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.8552343249320984, + "rewards/margins": 0.20935793220996857, + "rewards/rejected": -1.0645922422409058, + "step": 6880 + }, + { + "epoch": 1.1871123363197795, + "grad_norm": 18.84161376953125, + "learning_rate": 7.565111023970246e-08, + "logits/chosen": -2.5000433921813965, + "logits/rejected": -2.4791622161865234, + "logps/chosen": -133.22152709960938, + "logps/rejected": -156.64381408691406, + "loss": 0.6162, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.7999873757362366, + "rewards/margins": 0.24644342064857483, + "rewards/rejected": -1.0464308261871338, + "step": 6890 + }, + { + "epoch": 1.1888352860096485, + "grad_norm": 27.533275604248047, + "learning_rate": 7.556501325649031e-08, + "logits/chosen": -2.547579288482666, + "logits/rejected": -2.5297350883483887, + "logps/chosen": -138.42526245117188, + "logps/rejected": -156.234375, + "loss": 0.6339, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.8409918546676636, + "rewards/margins": 0.19497635960578918, + "rewards/rejected": -1.0359681844711304, + "step": 6900 + }, + { + "epoch": 1.1905582356995175, + "grad_norm": 22.588600158691406, + "learning_rate": 7.547881351717425e-08, + "logits/chosen": -2.5680556297302246, + "logits/rejected": -2.5507588386535645, + "logps/chosen": -142.8251953125, + "logps/rejected": -162.5928192138672, + "loss": 0.6169, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.8501895070075989, + "rewards/margins": 0.2276049107313156, + "rewards/rejected": -1.0777945518493652, + "step": 6910 + }, + { + "epoch": 1.1922811853893867, + "grad_norm": 27.605674743652344, + "learning_rate": 7.539251136822582e-08, + "logits/chosen": -2.6103172302246094, + "logits/rejected": -2.5834383964538574, + "logps/chosen": -148.08206176757812, + "logps/rejected": -162.20498657226562, + "loss": 0.6456, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.9144037961959839, + "rewards/margins": 0.1720445156097412, + "rewards/rejected": -1.086448311805725, + "step": 6920 + }, + { + "epoch": 1.1940041350792556, + "grad_norm": 24.64449119567871, + "learning_rate": 7.530610715652816e-08, + "logits/chosen": -2.507188320159912, + "logits/rejected": -2.478717088699341, + "logps/chosen": -139.48257446289062, + "logps/rejected": -160.83448791503906, + "loss": 0.6039, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.8062465786933899, + "rewards/margins": 0.2692093253135681, + "rewards/rejected": -1.0754557847976685, + "step": 6930 + }, + { + "epoch": 1.1957270847691248, + "grad_norm": 25.173749923706055, + "learning_rate": 7.521960122937469e-08, + "logits/chosen": -2.4724910259246826, + "logits/rejected": -2.4382050037384033, + "logps/chosen": -141.81011962890625, + "logps/rejected": -157.2001495361328, + "loss": 0.6054, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.8174602389335632, + "rewards/margins": 0.2571207880973816, + "rewards/rejected": -1.0745811462402344, + "step": 6940 + }, + { + "epoch": 1.1974500344589938, + "grad_norm": 24.128093719482422, + "learning_rate": 7.513299393446761e-08, + "logits/chosen": -2.5183544158935547, + "logits/rejected": -2.5030174255371094, + "logps/chosen": -143.48355102539062, + "logps/rejected": -169.42686462402344, + "loss": 0.6007, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.8923398852348328, + "rewards/margins": 0.27517181634902954, + "rewards/rejected": -1.1675117015838623, + "step": 6950 + }, + { + "epoch": 1.1991729841488628, + "grad_norm": 31.939889907836914, + "learning_rate": 7.504628561991661e-08, + "logits/chosen": -2.626190662384033, + "logits/rejected": -2.5996010303497314, + "logps/chosen": -160.25259399414062, + "logps/rejected": -166.00714111328125, + "loss": 0.6555, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.0419825315475464, + "rewards/margins": 0.14706826210021973, + "rewards/rejected": -1.1890507936477661, + "step": 6960 + }, + { + "epoch": 1.200895933838732, + "grad_norm": 24.720861434936523, + "learning_rate": 7.495947663423736e-08, + "logits/chosen": -2.5689542293548584, + "logits/rejected": -2.548269271850586, + "logps/chosen": -145.82464599609375, + "logps/rejected": -160.35841369628906, + "loss": 0.6278, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.9043046236038208, + "rewards/margins": 0.1990642249584198, + "rewards/rejected": -1.103368878364563, + "step": 6970 + }, + { + "epoch": 1.202618883528601, + "grad_norm": 18.39963722229004, + "learning_rate": 7.487256732635024e-08, + "logits/chosen": -2.5041344165802, + "logits/rejected": -2.4827022552490234, + "logps/chosen": -144.06503295898438, + "logps/rejected": -163.56167602539062, + "loss": 0.6172, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.8870753049850464, + "rewards/margins": 0.2288040816783905, + "rewards/rejected": -1.1158792972564697, + "step": 6980 + }, + { + "epoch": 1.20434183321847, + "grad_norm": 19.67237663269043, + "learning_rate": 7.478555804557881e-08, + "logits/chosen": -2.4596590995788574, + "logits/rejected": -2.449921131134033, + "logps/chosen": -147.87271118164062, + "logps/rejected": -159.21084594726562, + "loss": 0.6508, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.8981518745422363, + "rewards/margins": 0.15594662725925446, + "rewards/rejected": -1.05409836769104, + "step": 6990 + }, + { + "epoch": 1.206064782908339, + "grad_norm": 22.669862747192383, + "learning_rate": 7.469844914164847e-08, + "logits/chosen": -2.665203809738159, + "logits/rejected": -2.6429755687713623, + "logps/chosen": -146.79232788085938, + "logps/rejected": -164.08370971679688, + "loss": 0.6235, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.8958548307418823, + "rewards/margins": 0.20432892441749573, + "rewards/rejected": -1.1001837253570557, + "step": 7000 + }, + { + "epoch": 1.207787732598208, + "grad_norm": 18.724201202392578, + "learning_rate": 7.461124096468505e-08, + "logits/chosen": -2.545606851577759, + "logits/rejected": -2.5247297286987305, + "logps/chosen": -139.4956817626953, + "logps/rejected": -154.31460571289062, + "loss": 0.6278, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.8235694766044617, + "rewards/margins": 0.19837680459022522, + "rewards/rejected": -1.0219463109970093, + "step": 7010 + }, + { + "epoch": 1.2095106822880772, + "grad_norm": 24.118534088134766, + "learning_rate": 7.45239338652134e-08, + "logits/chosen": -2.516458034515381, + "logits/rejected": -2.494795322418213, + "logps/chosen": -131.16363525390625, + "logps/rejected": -150.270263671875, + "loss": 0.6322, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.7914215326309204, + "rewards/margins": 0.1959744691848755, + "rewards/rejected": -0.9873960614204407, + "step": 7020 + }, + { + "epoch": 1.2112336319779462, + "grad_norm": 21.872209548950195, + "learning_rate": 7.443652819415603e-08, + "logits/chosen": -2.569121837615967, + "logits/rejected": -2.5502028465270996, + "logps/chosen": -136.0243682861328, + "logps/rejected": -152.87554931640625, + "loss": 0.6412, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8177220225334167, + "rewards/margins": 0.1662174016237259, + "rewards/rejected": -0.9839394688606262, + "step": 7030 + }, + { + "epoch": 1.2129565816678154, + "grad_norm": 38.59823989868164, + "learning_rate": 7.434902430283154e-08, + "logits/chosen": -2.5355591773986816, + "logits/rejected": -2.516512870788574, + "logps/chosen": -137.9992218017578, + "logps/rejected": -156.84129333496094, + "loss": 0.631, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.8045310974121094, + "rewards/margins": 0.20467019081115723, + "rewards/rejected": -1.0092014074325562, + "step": 7040 + }, + { + "epoch": 1.2146795313576844, + "grad_norm": 24.6149959564209, + "learning_rate": 7.426142254295343e-08, + "logits/chosen": -2.507087230682373, + "logits/rejected": -2.4920742511749268, + "logps/chosen": -130.8026123046875, + "logps/rejected": -148.0153350830078, + "loss": 0.6415, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.7819786071777344, + "rewards/margins": 0.1737910807132721, + "rewards/rejected": -0.9557696580886841, + "step": 7050 + }, + { + "epoch": 1.2164024810475533, + "grad_norm": 22.87141990661621, + "learning_rate": 7.417372326662845e-08, + "logits/chosen": -2.562243700027466, + "logits/rejected": -2.5538878440856934, + "logps/chosen": -135.90615844726562, + "logps/rejected": -150.73922729492188, + "loss": 0.6425, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.8219138979911804, + "rewards/margins": 0.16739928722381592, + "rewards/rejected": -0.9893131256103516, + "step": 7060 + }, + { + "epoch": 1.2181254307374225, + "grad_norm": 23.218414306640625, + "learning_rate": 7.408592682635546e-08, + "logits/chosen": -2.5308501720428467, + "logits/rejected": -2.513751745223999, + "logps/chosen": -138.0851593017578, + "logps/rejected": -144.14117431640625, + "loss": 0.6813, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.8276640176773071, + "rewards/margins": 0.08705426007509232, + "rewards/rejected": -0.9147183299064636, + "step": 7070 + }, + { + "epoch": 1.2198483804272915, + "grad_norm": 25.701818466186523, + "learning_rate": 7.399803357502372e-08, + "logits/chosen": -2.5900769233703613, + "logits/rejected": -2.566382884979248, + "logps/chosen": -128.85079956054688, + "logps/rejected": -145.69757080078125, + "loss": 0.6356, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7443950176239014, + "rewards/margins": 0.17536406219005585, + "rewards/rejected": -0.9197589755058289, + "step": 7080 + }, + { + "epoch": 1.2215713301171607, + "grad_norm": 42.72899627685547, + "learning_rate": 7.391004386591171e-08, + "logits/chosen": -2.6033012866973877, + "logits/rejected": -2.595266342163086, + "logps/chosen": -128.8573455810547, + "logps/rejected": -147.2779541015625, + "loss": 0.6188, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.7513642311096191, + "rewards/margins": 0.21458382904529572, + "rewards/rejected": -0.9659481048583984, + "step": 7090 + }, + { + "epoch": 1.2232942798070296, + "grad_norm": 20.889738082885742, + "learning_rate": 7.382195805268555e-08, + "logits/chosen": -2.518144369125366, + "logits/rejected": -2.4989988803863525, + "logps/chosen": -133.90628051757812, + "logps/rejected": -151.20870971679688, + "loss": 0.6344, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.7652236223220825, + "rewards/margins": 0.18906177580356598, + "rewards/rejected": -0.9542854428291321, + "step": 7100 + }, + { + "epoch": 1.2250172294968986, + "grad_norm": 18.80124855041504, + "learning_rate": 7.373377648939768e-08, + "logits/chosen": -2.5407471656799316, + "logits/rejected": -2.517430305480957, + "logps/chosen": -130.78244018554688, + "logps/rejected": -139.30172729492188, + "loss": 0.6569, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.7649413347244263, + "rewards/margins": 0.1254729926586151, + "rewards/rejected": -0.8904143571853638, + "step": 7110 + }, + { + "epoch": 1.2267401791867678, + "grad_norm": 29.442237854003906, + "learning_rate": 7.364549953048537e-08, + "logits/chosen": -2.533996820449829, + "logits/rejected": -2.500549793243408, + "logps/chosen": -136.3294677734375, + "logps/rejected": -143.03616333007812, + "loss": 0.6355, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.7312331199645996, + "rewards/margins": 0.1811157912015915, + "rewards/rejected": -0.9123488664627075, + "step": 7120 + }, + { + "epoch": 1.2284631288766368, + "grad_norm": 25.558725357055664, + "learning_rate": 7.355712753076936e-08, + "logits/chosen": -2.4735474586486816, + "logits/rejected": -2.4533772468566895, + "logps/chosen": -129.5522918701172, + "logps/rejected": -144.97238159179688, + "loss": 0.6257, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.7247132062911987, + "rewards/margins": 0.1992671936750412, + "rewards/rejected": -0.9239804148674011, + "step": 7130 + }, + { + "epoch": 1.230186078566506, + "grad_norm": 19.20479393005371, + "learning_rate": 7.346866084545236e-08, + "logits/chosen": -2.5496201515197754, + "logits/rejected": -2.5420947074890137, + "logps/chosen": -123.5148696899414, + "logps/rejected": -141.8740692138672, + "loss": 0.6282, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.7120001912117004, + "rewards/margins": 0.19337035715579987, + "rewards/rejected": -0.9053705334663391, + "step": 7140 + }, + { + "epoch": 1.231909028256375, + "grad_norm": 33.56254959106445, + "learning_rate": 7.338009983011769e-08, + "logits/chosen": -2.5616440773010254, + "logits/rejected": -2.546262741088867, + "logps/chosen": -144.4009552001953, + "logps/rejected": -155.1837921142578, + "loss": 0.6583, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.8757875561714172, + "rewards/margins": 0.13266423344612122, + "rewards/rejected": -1.0084518194198608, + "step": 7150 + }, + { + "epoch": 1.233631977946244, + "grad_norm": 34.856624603271484, + "learning_rate": 7.329144484072778e-08, + "logits/chosen": -2.5242040157318115, + "logits/rejected": -2.4948160648345947, + "logps/chosen": -130.3624267578125, + "logps/rejected": -152.0260009765625, + "loss": 0.6117, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7619328498840332, + "rewards/margins": 0.23107370734214783, + "rewards/rejected": -0.9930065274238586, + "step": 7160 + }, + { + "epoch": 1.235354927636113, + "grad_norm": 32.98360061645508, + "learning_rate": 7.320269623362282e-08, + "logits/chosen": -2.5037379264831543, + "logits/rejected": -2.4790263175964355, + "logps/chosen": -133.53384399414062, + "logps/rejected": -153.2665252685547, + "loss": 0.6222, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.7748786211013794, + "rewards/margins": 0.2275443971157074, + "rewards/rejected": -1.0024230480194092, + "step": 7170 + }, + { + "epoch": 1.237077877325982, + "grad_norm": 22.934099197387695, + "learning_rate": 7.311385436551928e-08, + "logits/chosen": -2.6020641326904297, + "logits/rejected": -2.590449810028076, + "logps/chosen": -131.7655487060547, + "logps/rejected": -147.60829162597656, + "loss": 0.6302, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.7536253333091736, + "rewards/margins": 0.18533241748809814, + "rewards/rejected": -0.9389578104019165, + "step": 7180 + }, + { + "epoch": 1.2388008270158513, + "grad_norm": 21.707487106323242, + "learning_rate": 7.302491959350846e-08, + "logits/chosen": -2.4640331268310547, + "logits/rejected": -2.442941904067993, + "logps/chosen": -128.1914825439453, + "logps/rejected": -151.66429138183594, + "loss": 0.6129, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.7514085173606873, + "rewards/margins": 0.23872356116771698, + "rewards/rejected": -0.9901320338249207, + "step": 7190 + }, + { + "epoch": 1.2405237767057202, + "grad_norm": 19.718080520629883, + "learning_rate": 7.293589227505511e-08, + "logits/chosen": -2.508129835128784, + "logits/rejected": -2.4907355308532715, + "logps/chosen": -133.34864807128906, + "logps/rejected": -158.2075958251953, + "loss": 0.6127, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7946994304656982, + "rewards/margins": 0.237708181142807, + "rewards/rejected": -1.032407522201538, + "step": 7200 + }, + { + "epoch": 1.2405237767057202, + "eval_logits/chosen": -2.5876901149749756, + "eval_logits/rejected": -2.582226276397705, + "eval_logps/chosen": -125.13570404052734, + "eval_logps/rejected": -140.89547729492188, + "eval_loss": 0.6558043956756592, + "eval_rewards/accuracies": 0.6038568615913391, + "eval_rewards/chosen": -0.6612022519111633, + "eval_rewards/margins": 0.12025635689496994, + "eval_rewards/rejected": -0.7814586162567139, + "eval_runtime": 383.022, + "eval_samples_per_second": 11.237, + "eval_steps_per_second": 1.405, + "step": 7200 + }, + { + "epoch": 1.2422467263955892, + "grad_norm": 26.754358291625977, + "learning_rate": 7.284677276799593e-08, + "logits/chosen": -2.5734477043151855, + "logits/rejected": -2.5553011894226074, + "logps/chosen": -143.27554321289062, + "logps/rejected": -146.91378784179688, + "loss": 0.6803, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.852822482585907, + "rewards/margins": 0.0901159793138504, + "rewards/rejected": -0.9429384469985962, + "step": 7210 + }, + { + "epoch": 1.2439696760854584, + "grad_norm": 32.42961883544922, + "learning_rate": 7.275756143053821e-08, + "logits/chosen": -2.4831254482269287, + "logits/rejected": -2.45436954498291, + "logps/chosen": -136.29603576660156, + "logps/rejected": -150.64312744140625, + "loss": 0.6357, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.8118859529495239, + "rewards/margins": 0.19399450719356537, + "rewards/rejected": -1.00588059425354, + "step": 7220 + }, + { + "epoch": 1.2456926257753274, + "grad_norm": 22.189268112182617, + "learning_rate": 7.266825862125827e-08, + "logits/chosen": -2.486410617828369, + "logits/rejected": -2.4737281799316406, + "logps/chosen": -137.525634765625, + "logps/rejected": -149.06851196289062, + "loss": 0.6451, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.8232401609420776, + "rewards/margins": 0.1511302888393402, + "rewards/rejected": -0.9743705987930298, + "step": 7230 + }, + { + "epoch": 1.2474155754651963, + "grad_norm": 21.52886390686035, + "learning_rate": 7.257886469910018e-08, + "logits/chosen": -2.5281074047088623, + "logits/rejected": -2.5180110931396484, + "logps/chosen": -140.8453826904297, + "logps/rejected": -158.28399658203125, + "loss": 0.6219, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.8273015022277832, + "rewards/margins": 0.20986250042915344, + "rewards/rejected": -1.0371639728546143, + "step": 7240 + }, + { + "epoch": 1.2491385251550655, + "grad_norm": 21.729318618774414, + "learning_rate": 7.248938002337412e-08, + "logits/chosen": -2.537574291229248, + "logits/rejected": -2.5159497261047363, + "logps/chosen": -139.05413818359375, + "logps/rejected": -150.28652954101562, + "loss": 0.6384, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.833267867565155, + "rewards/margins": 0.1575065553188324, + "rewards/rejected": -0.9907743334770203, + "step": 7250 + }, + { + "epoch": 1.2508614748449345, + "grad_norm": 26.113866806030273, + "learning_rate": 7.239980495375518e-08, + "logits/chosen": -2.546105146408081, + "logits/rejected": -2.523954153060913, + "logps/chosen": -137.6354522705078, + "logps/rejected": -154.47494506835938, + "loss": 0.6156, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.8253766894340515, + "rewards/margins": 0.2103661596775055, + "rewards/rejected": -1.0357427597045898, + "step": 7260 + }, + { + "epoch": 1.2525844245348035, + "grad_norm": 26.12915802001953, + "learning_rate": 7.231013985028168e-08, + "logits/chosen": -2.560234546661377, + "logits/rejected": -2.5334925651550293, + "logps/chosen": -131.84494018554688, + "logps/rejected": -148.89859008789062, + "loss": 0.62, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.7643373608589172, + "rewards/margins": 0.2046651542186737, + "rewards/rejected": -0.9690025448799133, + "step": 7270 + }, + { + "epoch": 1.2543073742246726, + "grad_norm": 19.570632934570312, + "learning_rate": 7.222038507335384e-08, + "logits/chosen": -2.5897583961486816, + "logits/rejected": -2.55975079536438, + "logps/chosen": -137.03005981445312, + "logps/rejected": -153.3487091064453, + "loss": 0.6194, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.7909359931945801, + "rewards/margins": 0.22452524304389954, + "rewards/rejected": -1.0154612064361572, + "step": 7280 + }, + { + "epoch": 1.2560303239145416, + "grad_norm": 26.04621696472168, + "learning_rate": 7.213054098373232e-08, + "logits/chosen": -2.451444625854492, + "logits/rejected": -2.4375545978546143, + "logps/chosen": -141.99853515625, + "logps/rejected": -157.99383544921875, + "loss": 0.6407, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.8617357015609741, + "rewards/margins": 0.16678765416145325, + "rewards/rejected": -1.028523325920105, + "step": 7290 + }, + { + "epoch": 1.2577532736044108, + "grad_norm": 22.285764694213867, + "learning_rate": 7.204060794253679e-08, + "logits/chosen": -2.420621395111084, + "logits/rejected": -2.403648853302002, + "logps/chosen": -132.68133544921875, + "logps/rejected": -153.64938354492188, + "loss": 0.634, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.8204687237739563, + "rewards/margins": 0.19447723031044006, + "rewards/rejected": -1.0149458646774292, + "step": 7300 + }, + { + "epoch": 1.2594762232942798, + "grad_norm": 22.316200256347656, + "learning_rate": 7.195058631124443e-08, + "logits/chosen": -2.5474612712860107, + "logits/rejected": -2.527472972869873, + "logps/chosen": -145.7023468017578, + "logps/rejected": -164.7689666748047, + "loss": 0.6177, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.9006555676460266, + "rewards/margins": 0.23807115852832794, + "rewards/rejected": -1.138726830482483, + "step": 7310 + }, + { + "epoch": 1.2611991729841487, + "grad_norm": 23.586435317993164, + "learning_rate": 7.186047645168849e-08, + "logits/chosen": -2.54943585395813, + "logits/rejected": -2.5296919345855713, + "logps/chosen": -138.24465942382812, + "logps/rejected": -153.2172393798828, + "loss": 0.6292, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.8254194259643555, + "rewards/margins": 0.20264151692390442, + "rewards/rejected": -1.0280609130859375, + "step": 7320 + }, + { + "epoch": 1.262922122674018, + "grad_norm": 36.1264533996582, + "learning_rate": 7.177027872605686e-08, + "logits/chosen": -2.4627273082733154, + "logits/rejected": -2.4443767070770264, + "logps/chosen": -139.52914428710938, + "logps/rejected": -166.06155395507812, + "loss": 0.5893, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.8535804748535156, + "rewards/margins": 0.3056808114051819, + "rewards/rejected": -1.1592612266540527, + "step": 7330 + }, + { + "epoch": 1.264645072363887, + "grad_norm": 27.05265998840332, + "learning_rate": 7.167999349689062e-08, + "logits/chosen": -2.525515079498291, + "logits/rejected": -2.502408742904663, + "logps/chosen": -147.11248779296875, + "logps/rejected": -160.80465698242188, + "loss": 0.6513, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.9110174179077148, + "rewards/margins": 0.16634492576122284, + "rewards/rejected": -1.0773624181747437, + "step": 7340 + }, + { + "epoch": 1.266368022053756, + "grad_norm": 29.149703979492188, + "learning_rate": 7.158962112708247e-08, + "logits/chosen": -2.575409412384033, + "logits/rejected": -2.553063154220581, + "logps/chosen": -140.3431854248047, + "logps/rejected": -154.044189453125, + "loss": 0.6291, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.819327175617218, + "rewards/margins": 0.21409860253334045, + "rewards/rejected": -1.0334258079528809, + "step": 7350 + }, + { + "epoch": 1.268090971743625, + "grad_norm": 21.219430923461914, + "learning_rate": 7.14991619798755e-08, + "logits/chosen": -2.4919114112854004, + "logits/rejected": -2.477019786834717, + "logps/chosen": -141.4704132080078, + "logps/rejected": -157.49951171875, + "loss": 0.6308, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.8595463633537292, + "rewards/margins": 0.20486697554588318, + "rewards/rejected": -1.0644134283065796, + "step": 7360 + }, + { + "epoch": 1.269813921433494, + "grad_norm": 32.99021530151367, + "learning_rate": 7.140861641886148e-08, + "logits/chosen": -2.4564788341522217, + "logits/rejected": -2.439401626586914, + "logps/chosen": -141.84283447265625, + "logps/rejected": -154.24302673339844, + "loss": 0.6476, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.8768693804740906, + "rewards/margins": 0.16116423904895782, + "rewards/rejected": -1.0380337238311768, + "step": 7370 + }, + { + "epoch": 1.2715368711233632, + "grad_norm": 22.127519607543945, + "learning_rate": 7.131798480797957e-08, + "logits/chosen": -2.5054032802581787, + "logits/rejected": -2.488107204437256, + "logps/chosen": -138.72915649414062, + "logps/rejected": -162.11215209960938, + "loss": 0.6188, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.852269172668457, + "rewards/margins": 0.23749974370002747, + "rewards/rejected": -1.0897690057754517, + "step": 7380 + }, + { + "epoch": 1.2732598208132322, + "grad_norm": 38.59953689575195, + "learning_rate": 7.12272675115148e-08, + "logits/chosen": -2.4843626022338867, + "logits/rejected": -2.470242738723755, + "logps/chosen": -137.43231201171875, + "logps/rejected": -158.0469970703125, + "loss": 0.6284, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.8447599411010742, + "rewards/margins": 0.21666069328784943, + "rewards/rejected": -1.0614207983016968, + "step": 7390 + }, + { + "epoch": 1.2749827705031014, + "grad_norm": 22.354713439941406, + "learning_rate": 7.113646489409654e-08, + "logits/chosen": -2.479954957962036, + "logits/rejected": -2.449371337890625, + "logps/chosen": -150.95327758789062, + "logps/rejected": -165.73294067382812, + "loss": 0.6249, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.9459403157234192, + "rewards/margins": 0.2137012928724289, + "rewards/rejected": -1.1596416234970093, + "step": 7400 + }, + { + "epoch": 1.2767057201929704, + "grad_norm": 24.540973663330078, + "learning_rate": 7.104557732069722e-08, + "logits/chosen": -2.4785265922546387, + "logits/rejected": -2.446809768676758, + "logps/chosen": -142.2882537841797, + "logps/rejected": -164.82403564453125, + "loss": 0.6116, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.8778212666511536, + "rewards/margins": 0.25136417150497437, + "rewards/rejected": -1.129185438156128, + "step": 7410 + }, + { + "epoch": 1.2784286698828393, + "grad_norm": 23.970643997192383, + "learning_rate": 7.09546051566306e-08, + "logits/chosen": -2.4319984912872314, + "logits/rejected": -2.4142327308654785, + "logps/chosen": -147.84335327148438, + "logps/rejected": -164.019775390625, + "loss": 0.6262, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.9034894704818726, + "rewards/margins": 0.20930960774421692, + "rewards/rejected": -1.1127991676330566, + "step": 7420 + }, + { + "epoch": 1.2801516195727085, + "grad_norm": 28.692846298217773, + "learning_rate": 7.086354876755058e-08, + "logits/chosen": -2.4339330196380615, + "logits/rejected": -2.4105517864227295, + "logps/chosen": -156.17025756835938, + "logps/rejected": -180.38577270507812, + "loss": 0.6097, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.0276180505752563, + "rewards/margins": 0.2644575238227844, + "rewards/rejected": -1.292075514793396, + "step": 7430 + }, + { + "epoch": 1.2818745692625775, + "grad_norm": 27.577713012695312, + "learning_rate": 7.07724085194495e-08, + "logits/chosen": -2.508514881134033, + "logits/rejected": -2.4860053062438965, + "logps/chosen": -162.51748657226562, + "logps/rejected": -180.8822479248047, + "loss": 0.6349, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.0603501796722412, + "rewards/margins": 0.22516348958015442, + "rewards/rejected": -1.2855136394500732, + "step": 7440 + }, + { + "epoch": 1.2835975189524467, + "grad_norm": 33.16237258911133, + "learning_rate": 7.068118477865677e-08, + "logits/chosen": -2.582409620285034, + "logits/rejected": -2.559049606323242, + "logps/chosen": -156.26156616210938, + "logps/rejected": -167.25140380859375, + "loss": 0.6504, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.9874914884567261, + "rewards/margins": 0.16984400153160095, + "rewards/rejected": -1.1573354005813599, + "step": 7450 + }, + { + "epoch": 1.2853204686423156, + "grad_norm": 25.894336700439453, + "learning_rate": 7.058987791183744e-08, + "logits/chosen": -2.451228618621826, + "logits/rejected": -2.448843240737915, + "logps/chosen": -147.57638549804688, + "logps/rejected": -170.20025634765625, + "loss": 0.632, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.965354323387146, + "rewards/margins": 0.21499767899513245, + "rewards/rejected": -1.180351972579956, + "step": 7460 + }, + { + "epoch": 1.2870434183321846, + "grad_norm": 26.20570945739746, + "learning_rate": 7.049848828599064e-08, + "logits/chosen": -2.514191150665283, + "logits/rejected": -2.5005524158477783, + "logps/chosen": -152.73623657226562, + "logps/rejected": -164.35342407226562, + "loss": 0.6567, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.9762645959854126, + "rewards/margins": 0.16078788042068481, + "rewards/rejected": -1.1370524168014526, + "step": 7470 + }, + { + "epoch": 1.2887663680220538, + "grad_norm": 23.704248428344727, + "learning_rate": 7.040701626844819e-08, + "logits/chosen": -2.465456485748291, + "logits/rejected": -2.442073345184326, + "logps/chosen": -149.51150512695312, + "logps/rejected": -159.86837768554688, + "loss": 0.6446, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.9404380917549133, + "rewards/margins": 0.17175881564617157, + "rewards/rejected": -1.112196922302246, + "step": 7480 + }, + { + "epoch": 1.2904893177119228, + "grad_norm": 26.433223724365234, + "learning_rate": 7.031546222687296e-08, + "logits/chosen": -2.4424567222595215, + "logits/rejected": -2.4317264556884766, + "logps/chosen": -154.74525451660156, + "logps/rejected": -174.90280151367188, + "loss": 0.6298, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.0154364109039307, + "rewards/margins": 0.20561465620994568, + "rewards/rejected": -1.2210509777069092, + "step": 7490 + }, + { + "epoch": 1.292212267401792, + "grad_norm": 34.223243713378906, + "learning_rate": 7.022382652925766e-08, + "logits/chosen": -2.464292049407959, + "logits/rejected": -2.4453139305114746, + "logps/chosen": -152.37490844726562, + "logps/rejected": -168.61769104003906, + "loss": 0.6606, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.0072271823883057, + "rewards/margins": 0.16684751212596893, + "rewards/rejected": -1.174074649810791, + "step": 7500 + }, + { + "epoch": 1.293935217091661, + "grad_norm": 32.43608474731445, + "learning_rate": 7.01321095439231e-08, + "logits/chosen": -2.471198558807373, + "logits/rejected": -2.453827381134033, + "logps/chosen": -154.67567443847656, + "logps/rejected": -164.9767303466797, + "loss": 0.6459, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.9621097445487976, + "rewards/margins": 0.1783999502658844, + "rewards/rejected": -1.1405094861984253, + "step": 7510 + }, + { + "epoch": 1.29565816678153, + "grad_norm": 29.828994750976562, + "learning_rate": 7.004031163951686e-08, + "logits/chosen": -2.478649377822876, + "logits/rejected": -2.4622559547424316, + "logps/chosen": -145.52989196777344, + "logps/rejected": -162.56576538085938, + "loss": 0.6354, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.910129725933075, + "rewards/margins": 0.1902073472738266, + "rewards/rejected": -1.100337028503418, + "step": 7520 + }, + { + "epoch": 1.297381116471399, + "grad_norm": 25.16864585876465, + "learning_rate": 6.994843318501175e-08, + "logits/chosen": -2.434582471847534, + "logits/rejected": -2.4288063049316406, + "logps/chosen": -144.28469848632812, + "logps/rejected": -161.70635986328125, + "loss": 0.6463, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.9157862663269043, + "rewards/margins": 0.17825372517108917, + "rewards/rejected": -1.0940399169921875, + "step": 7530 + }, + { + "epoch": 1.299104066161268, + "grad_norm": 17.154449462890625, + "learning_rate": 6.985647454970436e-08, + "logits/chosen": -2.5492055416107178, + "logits/rejected": -2.5423622131347656, + "logps/chosen": -130.44430541992188, + "logps/rejected": -154.4150848388672, + "loss": 0.6089, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.778977632522583, + "rewards/margins": 0.24291332066059113, + "rewards/rejected": -1.0218908786773682, + "step": 7540 + }, + { + "epoch": 1.3008270158511372, + "grad_norm": 23.36393165588379, + "learning_rate": 6.976443610321355e-08, + "logits/chosen": -2.4938864707946777, + "logits/rejected": -2.47977876663208, + "logps/chosen": -135.61158752441406, + "logps/rejected": -156.0998077392578, + "loss": 0.62, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.824744701385498, + "rewards/margins": 0.22780942916870117, + "rewards/rejected": -1.0525541305541992, + "step": 7550 + }, + { + "epoch": 1.3025499655410062, + "grad_norm": 23.027864456176758, + "learning_rate": 6.9672318215479e-08, + "logits/chosen": -2.5449604988098145, + "logits/rejected": -2.527798652648926, + "logps/chosen": -131.2892608642578, + "logps/rejected": -160.10562133789062, + "loss": 0.598, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.7776525020599365, + "rewards/margins": 0.2734105587005615, + "rewards/rejected": -1.0510631799697876, + "step": 7560 + }, + { + "epoch": 1.3042729152308752, + "grad_norm": 21.192401885986328, + "learning_rate": 6.958012125675961e-08, + "logits/chosen": -2.5487101078033447, + "logits/rejected": -2.531736135482788, + "logps/chosen": -147.08360290527344, + "logps/rejected": -165.15524291992188, + "loss": 0.6196, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.9009296298027039, + "rewards/margins": 0.23346951603889465, + "rewards/rejected": -1.134399175643921, + "step": 7570 + }, + { + "epoch": 1.3059958649207444, + "grad_norm": 27.005096435546875, + "learning_rate": 6.948784559763221e-08, + "logits/chosen": -2.5103249549865723, + "logits/rejected": -2.494823932647705, + "logps/chosen": -144.07765197753906, + "logps/rejected": -160.0559844970703, + "loss": 0.6343, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.868603527545929, + "rewards/margins": 0.19318082928657532, + "rewards/rejected": -1.0617843866348267, + "step": 7580 + }, + { + "epoch": 1.3077188146106133, + "grad_norm": 30.346166610717773, + "learning_rate": 6.93954916089899e-08, + "logits/chosen": -2.519892454147339, + "logits/rejected": -2.4853830337524414, + "logps/chosen": -153.8564910888672, + "logps/rejected": -169.29183959960938, + "loss": 0.6149, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9491060376167297, + "rewards/margins": 0.24793191254138947, + "rewards/rejected": -1.197037935256958, + "step": 7590 + }, + { + "epoch": 1.3094417643004825, + "grad_norm": 31.32855224609375, + "learning_rate": 6.930305966204059e-08, + "logits/chosen": -2.4879536628723145, + "logits/rejected": -2.4646151065826416, + "logps/chosen": -147.8095245361328, + "logps/rejected": -157.54727172851562, + "loss": 0.6531, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.9458245038986206, + "rewards/margins": 0.15245869755744934, + "rewards/rejected": -1.098283290863037, + "step": 7600 + }, + { + "epoch": 1.3094417643004825, + "eval_logits/chosen": -2.556385040283203, + "eval_logits/rejected": -2.550178289413452, + "eval_logps/chosen": -133.6133270263672, + "eval_logps/rejected": -150.78623962402344, + "eval_loss": 0.6534062027931213, + "eval_rewards/accuracies": 0.6040892004966736, + "eval_rewards/chosen": -0.7459785342216492, + "eval_rewards/margins": 0.13438780605793, + "eval_rewards/rejected": -0.880366325378418, + "eval_runtime": 383.0675, + "eval_samples_per_second": 11.236, + "eval_steps_per_second": 1.404, + "step": 7600 + }, + { + "epoch": 1.3111647139903515, + "grad_norm": 24.08529281616211, + "learning_rate": 6.921055012830563e-08, + "logits/chosen": -2.4477341175079346, + "logits/rejected": -2.423722743988037, + "logps/chosen": -145.34474182128906, + "logps/rejected": -161.76712036132812, + "loss": 0.6323, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.9105755090713501, + "rewards/margins": 0.21531391143798828, + "rewards/rejected": -1.125889539718628, + "step": 7610 + }, + { + "epoch": 1.3128876636802205, + "grad_norm": 19.797929763793945, + "learning_rate": 6.911796337961813e-08, + "logits/chosen": -2.4861130714416504, + "logits/rejected": -2.4657671451568604, + "logps/chosen": -142.93849182128906, + "logps/rejected": -161.22935485839844, + "loss": 0.6132, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.8410897254943848, + "rewards/margins": 0.23532815277576447, + "rewards/rejected": -1.0764179229736328, + "step": 7620 + }, + { + "epoch": 1.3146106133700897, + "grad_norm": 27.883426666259766, + "learning_rate": 6.902529978812159e-08, + "logits/chosen": -2.4483742713928223, + "logits/rejected": -2.4535410404205322, + "logps/chosen": -137.55007934570312, + "logps/rejected": -162.91024780273438, + "loss": 0.6167, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.8679767847061157, + "rewards/margins": 0.2256569117307663, + "rewards/rejected": -1.093633770942688, + "step": 7630 + }, + { + "epoch": 1.3163335630599586, + "grad_norm": 32.171966552734375, + "learning_rate": 6.893255972626838e-08, + "logits/chosen": -2.44758939743042, + "logits/rejected": -2.425205707550049, + "logps/chosen": -154.0350341796875, + "logps/rejected": -170.94781494140625, + "loss": 0.6284, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.955964207649231, + "rewards/margins": 0.23719045519828796, + "rewards/rejected": -1.1931545734405518, + "step": 7640 + }, + { + "epoch": 1.3180565127498278, + "grad_norm": 21.268714904785156, + "learning_rate": 6.883974356681823e-08, + "logits/chosen": -2.5042102336883545, + "logits/rejected": -2.486968517303467, + "logps/chosen": -161.19512939453125, + "logps/rejected": -175.97557067871094, + "loss": 0.6452, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -1.0189523696899414, + "rewards/margins": 0.1943272054195404, + "rewards/rejected": -1.2132797241210938, + "step": 7650 + }, + { + "epoch": 1.3197794624396968, + "grad_norm": 20.272716522216797, + "learning_rate": 6.874685168283675e-08, + "logits/chosen": -2.512941837310791, + "logits/rejected": -2.487058639526367, + "logps/chosen": -150.97122192382812, + "logps/rejected": -172.14968872070312, + "loss": 0.611, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.9367518424987793, + "rewards/margins": 0.24415257573127747, + "rewards/rejected": -1.1809046268463135, + "step": 7660 + }, + { + "epoch": 1.3215024121295658, + "grad_norm": 29.262372970581055, + "learning_rate": 6.865388444769388e-08, + "logits/chosen": -2.465951919555664, + "logits/rejected": -2.447601795196533, + "logps/chosen": -148.7430419921875, + "logps/rejected": -161.8044891357422, + "loss": 0.6379, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.90485680103302, + "rewards/margins": 0.1881795972585678, + "rewards/rejected": -1.093036413192749, + "step": 7670 + }, + { + "epoch": 1.323225361819435, + "grad_norm": 23.13471031188965, + "learning_rate": 6.856084223506247e-08, + "logits/chosen": -2.500370740890503, + "logits/rejected": -2.484502077102661, + "logps/chosen": -143.19085693359375, + "logps/rejected": -163.03858947753906, + "loss": 0.6177, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.9128525853157043, + "rewards/margins": 0.22799701988697052, + "rewards/rejected": -1.1408497095108032, + "step": 7680 + }, + { + "epoch": 1.324948311509304, + "grad_norm": 24.28408432006836, + "learning_rate": 6.84677254189167e-08, + "logits/chosen": -2.586789131164551, + "logits/rejected": -2.5469651222229004, + "logps/chosen": -140.514892578125, + "logps/rejected": -157.67715454101562, + "loss": 0.6139, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.8414154052734375, + "rewards/margins": 0.2492334395647049, + "rewards/rejected": -1.090648889541626, + "step": 7690 + }, + { + "epoch": 1.3266712611991731, + "grad_norm": 20.994827270507812, + "learning_rate": 6.837453437353064e-08, + "logits/chosen": -2.4863336086273193, + "logits/rejected": -2.456991672515869, + "logps/chosen": -143.09527587890625, + "logps/rejected": -162.80410766601562, + "loss": 0.6266, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.884503960609436, + "rewards/margins": 0.2061157524585724, + "rewards/rejected": -1.090619683265686, + "step": 7700 + }, + { + "epoch": 1.328394210889042, + "grad_norm": 28.234207153320312, + "learning_rate": 6.82812694734767e-08, + "logits/chosen": -2.493734121322632, + "logits/rejected": -2.4809954166412354, + "logps/chosen": -149.52813720703125, + "logps/rejected": -163.19261169433594, + "loss": 0.6498, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.9637618064880371, + "rewards/margins": 0.1601286083459854, + "rewards/rejected": -1.1238903999328613, + "step": 7710 + }, + { + "epoch": 1.330117160578911, + "grad_norm": 26.1023006439209, + "learning_rate": 6.818793109362416e-08, + "logits/chosen": -2.5035171508789062, + "logits/rejected": -2.4748847484588623, + "logps/chosen": -146.76425170898438, + "logps/rejected": -160.68130493164062, + "loss": 0.621, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8933634757995605, + "rewards/margins": 0.2182648628950119, + "rewards/rejected": -1.1116282939910889, + "step": 7720 + }, + { + "epoch": 1.33184011026878, + "grad_norm": 32.87727737426758, + "learning_rate": 6.80945196091376e-08, + "logits/chosen": -2.438372850418091, + "logits/rejected": -2.416724681854248, + "logps/chosen": -134.09129333496094, + "logps/rejected": -156.75234985351562, + "loss": 0.5997, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.7908092141151428, + "rewards/margins": 0.26274845004081726, + "rewards/rejected": -1.0535576343536377, + "step": 7730 + }, + { + "epoch": 1.3335630599586492, + "grad_norm": 22.23272705078125, + "learning_rate": 6.800103539547548e-08, + "logits/chosen": -2.4662704467773438, + "logits/rejected": -2.450282096862793, + "logps/chosen": -145.58279418945312, + "logps/rejected": -168.5486297607422, + "loss": 0.622, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.9237093925476074, + "rewards/margins": 0.23403067886829376, + "rewards/rejected": -1.1577401161193848, + "step": 7740 + }, + { + "epoch": 1.3352860096485184, + "grad_norm": 20.547780990600586, + "learning_rate": 6.790747882838859e-08, + "logits/chosen": -2.4921231269836426, + "logits/rejected": -2.4648630619049072, + "logps/chosen": -151.70297241210938, + "logps/rejected": -171.81393432617188, + "loss": 0.6228, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.9648507833480835, + "rewards/margins": 0.24369268119335175, + "rewards/rejected": -1.2085435390472412, + "step": 7750 + }, + { + "epoch": 1.3370089593383874, + "grad_norm": 34.338104248046875, + "learning_rate": 6.781385028391851e-08, + "logits/chosen": -2.384896755218506, + "logits/rejected": -2.3702826499938965, + "logps/chosen": -136.65414428710938, + "logps/rejected": -162.7095184326172, + "loss": 0.6044, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.8515266180038452, + "rewards/margins": 0.2675195038318634, + "rewards/rejected": -1.1190460920333862, + "step": 7760 + }, + { + "epoch": 1.3387319090282563, + "grad_norm": 25.451663970947266, + "learning_rate": 6.772015013839616e-08, + "logits/chosen": -2.4470276832580566, + "logits/rejected": -2.428715944290161, + "logps/chosen": -143.4765625, + "logps/rejected": -165.12799072265625, + "loss": 0.6191, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.9022136926651001, + "rewards/margins": 0.23084497451782227, + "rewards/rejected": -1.1330586671829224, + "step": 7770 + }, + { + "epoch": 1.3404548587181253, + "grad_norm": 25.063844680786133, + "learning_rate": 6.762637876844021e-08, + "logits/chosen": -2.529904365539551, + "logits/rejected": -2.5203545093536377, + "logps/chosen": -145.97256469726562, + "logps/rejected": -167.47885131835938, + "loss": 0.6327, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.9347503781318665, + "rewards/margins": 0.21358363330364227, + "rewards/rejected": -1.1483341455459595, + "step": 7780 + }, + { + "epoch": 1.3421778084079945, + "grad_norm": 31.19562530517578, + "learning_rate": 6.753253655095565e-08, + "logits/chosen": -2.5071768760681152, + "logits/rejected": -2.4931302070617676, + "logps/chosen": -145.25839233398438, + "logps/rejected": -168.6995849609375, + "loss": 0.6203, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.9317231178283691, + "rewards/margins": 0.24524815380573273, + "rewards/rejected": -1.176971197128296, + "step": 7790 + }, + { + "epoch": 1.3439007580978635, + "grad_norm": 21.104005813598633, + "learning_rate": 6.743862386313219e-08, + "logits/chosen": -2.512084722518921, + "logits/rejected": -2.501608371734619, + "logps/chosen": -147.72988891601562, + "logps/rejected": -174.69314575195312, + "loss": 0.6068, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.9235385656356812, + "rewards/margins": 0.2817572057247162, + "rewards/rejected": -1.2052958011627197, + "step": 7800 + }, + { + "epoch": 1.3456237077877327, + "grad_norm": 29.805986404418945, + "learning_rate": 6.734464108244285e-08, + "logits/chosen": -2.531437635421753, + "logits/rejected": -2.5037777423858643, + "logps/chosen": -148.46267700195312, + "logps/rejected": -169.79966735839844, + "loss": 0.6042, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.8897361755371094, + "rewards/margins": 0.2666037976741791, + "rewards/rejected": -1.1563400030136108, + "step": 7810 + }, + { + "epoch": 1.3473466574776016, + "grad_norm": 25.536258697509766, + "learning_rate": 6.725058858664234e-08, + "logits/chosen": -2.5017189979553223, + "logits/rejected": -2.4743571281433105, + "logps/chosen": -144.98760986328125, + "logps/rejected": -172.6921844482422, + "loss": 0.5926, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.8739396929740906, + "rewards/margins": 0.31180593371391296, + "rewards/rejected": -1.1857458353042603, + "step": 7820 + }, + { + "epoch": 1.3490696071674706, + "grad_norm": 29.31256103515625, + "learning_rate": 6.715646675376557e-08, + "logits/chosen": -2.4239916801452637, + "logits/rejected": -2.411015272140503, + "logps/chosen": -151.56417846679688, + "logps/rejected": -178.06365966796875, + "loss": 0.6215, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.0077168941497803, + "rewards/margins": 0.24843649566173553, + "rewards/rejected": -1.2561534643173218, + "step": 7830 + }, + { + "epoch": 1.3507925568573398, + "grad_norm": 23.53630256652832, + "learning_rate": 6.70622759621262e-08, + "logits/chosen": -2.4109740257263184, + "logits/rejected": -2.394785165786743, + "logps/chosen": -150.35752868652344, + "logps/rejected": -170.80088806152344, + "loss": 0.6364, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.9575635194778442, + "rewards/margins": 0.21887817978858948, + "rewards/rejected": -1.1764415502548218, + "step": 7840 + }, + { + "epoch": 1.3525155065472088, + "grad_norm": 20.69610595703125, + "learning_rate": 6.6968016590315e-08, + "logits/chosen": -2.4047062397003174, + "logits/rejected": -2.372786521911621, + "logps/chosen": -155.9188232421875, + "logps/rejected": -165.7627716064453, + "loss": 0.6492, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.9968074560165405, + "rewards/margins": 0.1855669915676117, + "rewards/rejected": -1.1823744773864746, + "step": 7850 + }, + { + "epoch": 1.354238456237078, + "grad_norm": 26.409324645996094, + "learning_rate": 6.687368901719843e-08, + "logits/chosen": -2.4540700912475586, + "logits/rejected": -2.4210312366485596, + "logps/chosen": -146.4640350341797, + "logps/rejected": -176.3026123046875, + "loss": 0.5928, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9319847822189331, + "rewards/margins": 0.3151303827762604, + "rewards/rejected": -1.2471152544021606, + "step": 7860 + }, + { + "epoch": 1.355961405926947, + "grad_norm": 28.181825637817383, + "learning_rate": 6.677929362191708e-08, + "logits/chosen": -2.460646629333496, + "logits/rejected": -2.4472787380218506, + "logps/chosen": -156.83065795898438, + "logps/rejected": -175.25631713867188, + "loss": 0.6348, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.000558614730835, + "rewards/margins": 0.2184421271085739, + "rewards/rejected": -1.2190005779266357, + "step": 7870 + }, + { + "epoch": 1.3576843556168159, + "grad_norm": 34.54668045043945, + "learning_rate": 6.668483078388411e-08, + "logits/chosen": -2.4971394538879395, + "logits/rejected": -2.4821231365203857, + "logps/chosen": -148.3890380859375, + "logps/rejected": -163.4290771484375, + "loss": 0.6518, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9440007209777832, + "rewards/margins": 0.1740240752696991, + "rewards/rejected": -1.1180247068405151, + "step": 7880 + }, + { + "epoch": 1.359407305306685, + "grad_norm": 30.347705841064453, + "learning_rate": 6.659030088278378e-08, + "logits/chosen": -2.4705967903137207, + "logits/rejected": -2.451388359069824, + "logps/chosen": -139.71258544921875, + "logps/rejected": -162.09963989257812, + "loss": 0.6221, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.8485860824584961, + "rewards/margins": 0.22423677146434784, + "rewards/rejected": -1.0728228092193604, + "step": 7890 + }, + { + "epoch": 1.361130254996554, + "grad_norm": 26.510684967041016, + "learning_rate": 6.649570429856992e-08, + "logits/chosen": -2.4827654361724854, + "logits/rejected": -2.4759905338287354, + "logps/chosen": -142.5802764892578, + "logps/rejected": -162.55567932128906, + "loss": 0.6289, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.8945484161376953, + "rewards/margins": 0.20854821801185608, + "rewards/rejected": -1.103096604347229, + "step": 7900 + }, + { + "epoch": 1.3628532046864232, + "grad_norm": 23.563236236572266, + "learning_rate": 6.640104141146439e-08, + "logits/chosen": -2.4833462238311768, + "logits/rejected": -2.463135242462158, + "logps/chosen": -148.6671142578125, + "logps/rejected": -164.3062286376953, + "loss": 0.6431, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.9139005541801453, + "rewards/margins": 0.19988103210926056, + "rewards/rejected": -1.1137816905975342, + "step": 7910 + }, + { + "epoch": 1.3645761543762922, + "grad_norm": 21.265867233276367, + "learning_rate": 6.630631260195548e-08, + "logits/chosen": -2.463007688522339, + "logits/rejected": -2.4445767402648926, + "logps/chosen": -143.3325653076172, + "logps/rejected": -161.53256225585938, + "loss": 0.6136, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.8710179328918457, + "rewards/margins": 0.23537349700927734, + "rewards/rejected": -1.106391191482544, + "step": 7920 + }, + { + "epoch": 1.3662991040661612, + "grad_norm": 27.329938888549805, + "learning_rate": 6.621151825079657e-08, + "logits/chosen": -2.506049633026123, + "logits/rejected": -2.484340190887451, + "logps/chosen": -150.452880859375, + "logps/rejected": -163.8821258544922, + "loss": 0.6313, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.9138700366020203, + "rewards/margins": 0.2064877450466156, + "rewards/rejected": -1.1203577518463135, + "step": 7930 + }, + { + "epoch": 1.3680220537560304, + "grad_norm": 34.85219955444336, + "learning_rate": 6.611665873900434e-08, + "logits/chosen": -2.4367105960845947, + "logits/rejected": -2.4192934036254883, + "logps/chosen": -151.02890014648438, + "logps/rejected": -175.16510009765625, + "loss": 0.6079, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9187140464782715, + "rewards/margins": 0.28845953941345215, + "rewards/rejected": -1.2071735858917236, + "step": 7940 + }, + { + "epoch": 1.3697450034458993, + "grad_norm": 24.84377670288086, + "learning_rate": 6.602173444785747e-08, + "logits/chosen": -2.4143567085266113, + "logits/rejected": -2.4058680534362793, + "logps/chosen": -139.19375610351562, + "logps/rejected": -167.15652465820312, + "loss": 0.6078, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.8710354566574097, + "rewards/margins": 0.2791138291358948, + "rewards/rejected": -1.1501493453979492, + "step": 7950 + }, + { + "epoch": 1.3714679531357685, + "grad_norm": 21.30821990966797, + "learning_rate": 6.5926745758895e-08, + "logits/chosen": -2.4210612773895264, + "logits/rejected": -2.398871898651123, + "logps/chosen": -142.59088134765625, + "logps/rejected": -163.7798614501953, + "loss": 0.6309, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.9171046018600464, + "rewards/margins": 0.20536521077156067, + "rewards/rejected": -1.1224697828292847, + "step": 7960 + }, + { + "epoch": 1.3731909028256375, + "grad_norm": 21.328529357910156, + "learning_rate": 6.583169305391479e-08, + "logits/chosen": -2.4894626140594482, + "logits/rejected": -2.469831705093384, + "logps/chosen": -148.52047729492188, + "logps/rejected": -162.56149291992188, + "loss": 0.64, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9207156896591187, + "rewards/margins": 0.18888339400291443, + "rewards/rejected": -1.109598994255066, + "step": 7970 + }, + { + "epoch": 1.3749138525155065, + "grad_norm": 27.226303100585938, + "learning_rate": 6.5736576714972e-08, + "logits/chosen": -2.518310070037842, + "logits/rejected": -2.5061028003692627, + "logps/chosen": -148.75880432128906, + "logps/rejected": -169.51956176757812, + "loss": 0.6238, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.9074954986572266, + "rewards/margins": 0.23433642089366913, + "rewards/rejected": -1.1418317556381226, + "step": 7980 + }, + { + "epoch": 1.3766368022053757, + "grad_norm": 23.61194610595703, + "learning_rate": 6.564139712437761e-08, + "logits/chosen": -2.51324200630188, + "logits/rejected": -2.495260715484619, + "logps/chosen": -144.06536865234375, + "logps/rejected": -171.9273681640625, + "loss": 0.6009, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9267575144767761, + "rewards/margins": 0.2783433496952057, + "rewards/rejected": -1.2051007747650146, + "step": 7990 + }, + { + "epoch": 1.3783597518952446, + "grad_norm": 30.414278030395508, + "learning_rate": 6.554615466469677e-08, + "logits/chosen": -2.413956880569458, + "logits/rejected": -2.3987700939178467, + "logps/chosen": -148.2495574951172, + "logps/rejected": -178.20701599121094, + "loss": 0.5995, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.9225319027900696, + "rewards/margins": 0.308528870344162, + "rewards/rejected": -1.2310607433319092, + "step": 8000 + }, + { + "epoch": 1.3783597518952446, + "eval_logits/chosen": -2.526700496673584, + "eval_logits/rejected": -2.5195157527923584, + "eval_logps/chosen": -140.2942352294922, + "eval_logps/rejected": -158.2948455810547, + "eval_loss": 0.6527594923973083, + "eval_rewards/accuracies": 0.6006041169166565, + "eval_rewards/chosen": -0.8127875924110413, + "eval_rewards/margins": 0.14266452193260193, + "eval_rewards/rejected": -0.9554521441459656, + "eval_runtime": 382.8699, + "eval_samples_per_second": 11.241, + "eval_steps_per_second": 1.405, + "step": 8000 + }, + { + "epoch": 1.3800827015851138, + "grad_norm": 30.691783905029297, + "learning_rate": 6.545084971874738e-08, + "logits/chosen": -2.4314675331115723, + "logits/rejected": -2.4045021533966064, + "logps/chosen": -157.57028198242188, + "logps/rejected": -173.6315155029297, + "loss": 0.6301, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9951564073562622, + "rewards/margins": 0.22977761924266815, + "rewards/rejected": -1.2249339818954468, + "step": 8010 + }, + { + "epoch": 1.3818056512749828, + "grad_norm": 25.586700439453125, + "learning_rate": 6.535548266959845e-08, + "logits/chosen": -2.4431333541870117, + "logits/rejected": -2.4124550819396973, + "logps/chosen": -168.57510375976562, + "logps/rejected": -186.40036010742188, + "loss": 0.6201, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0784189701080322, + "rewards/margins": 0.24669373035430908, + "rewards/rejected": -1.3251125812530518, + "step": 8020 + }, + { + "epoch": 1.3835286009648518, + "grad_norm": 29.118562698364258, + "learning_rate": 6.526005390056863e-08, + "logits/chosen": -2.4242732524871826, + "logits/rejected": -2.4101269245147705, + "logps/chosen": -152.4160919189453, + "logps/rejected": -176.9132537841797, + "loss": 0.6286, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.0079786777496338, + "rewards/margins": 0.2344062775373459, + "rewards/rejected": -1.242384910583496, + "step": 8030 + }, + { + "epoch": 1.385251550654721, + "grad_norm": 25.053556442260742, + "learning_rate": 6.516456379522468e-08, + "logits/chosen": -2.419296979904175, + "logits/rejected": -2.3870930671691895, + "logps/chosen": -165.99789428710938, + "logps/rejected": -185.56979370117188, + "loss": 0.6365, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1034375429153442, + "rewards/margins": 0.24526703357696533, + "rewards/rejected": -1.3487045764923096, + "step": 8040 + }, + { + "epoch": 1.38697450034459, + "grad_norm": 24.719209671020508, + "learning_rate": 6.506901273737985e-08, + "logits/chosen": -2.449517011642456, + "logits/rejected": -2.43290376663208, + "logps/chosen": -154.53453063964844, + "logps/rejected": -183.0343780517578, + "loss": 0.6069, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.9876915216445923, + "rewards/margins": 0.2905445396900177, + "rewards/rejected": -1.2782361507415771, + "step": 8050 + }, + { + "epoch": 1.388697450034459, + "grad_norm": 28.948888778686523, + "learning_rate": 6.497340111109239e-08, + "logits/chosen": -2.4893579483032227, + "logits/rejected": -2.4637856483459473, + "logps/chosen": -163.29798889160156, + "logps/rejected": -177.02935791015625, + "loss": 0.6403, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.068241834640503, + "rewards/margins": 0.20585057139396667, + "rewards/rejected": -1.274092435836792, + "step": 8060 + }, + { + "epoch": 1.390420399724328, + "grad_norm": 32.282264709472656, + "learning_rate": 6.4877729300664e-08, + "logits/chosen": -2.411339521408081, + "logits/rejected": -2.3874354362487793, + "logps/chosen": -154.8437957763672, + "logps/rejected": -171.14035034179688, + "loss": 0.6303, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.0067319869995117, + "rewards/margins": 0.2154075801372528, + "rewards/rejected": -1.222139596939087, + "step": 8070 + }, + { + "epoch": 1.392143349414197, + "grad_norm": 26.234561920166016, + "learning_rate": 6.478199769063833e-08, + "logits/chosen": -2.4240946769714355, + "logits/rejected": -2.415292501449585, + "logps/chosen": -145.56069946289062, + "logps/rejected": -180.4043731689453, + "loss": 0.5852, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.9170554876327515, + "rewards/margins": 0.3155246675014496, + "rewards/rejected": -1.2325801849365234, + "step": 8080 + }, + { + "epoch": 1.3938662991040662, + "grad_norm": 27.18834686279297, + "learning_rate": 6.468620666579927e-08, + "logits/chosen": -2.421595335006714, + "logits/rejected": -2.397613048553467, + "logps/chosen": -151.18453979492188, + "logps/rejected": -174.41067504882812, + "loss": 0.6138, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.969436764717102, + "rewards/margins": 0.27077406644821167, + "rewards/rejected": -1.2402108907699585, + "step": 8090 + }, + { + "epoch": 1.3955892487939352, + "grad_norm": 22.773412704467773, + "learning_rate": 6.459035661116967e-08, + "logits/chosen": -2.4691033363342285, + "logits/rejected": -2.463841676712036, + "logps/chosen": -149.60824584960938, + "logps/rejected": -170.5471954345703, + "loss": 0.6377, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.94111168384552, + "rewards/margins": 0.19068074226379395, + "rewards/rejected": -1.1317923069000244, + "step": 8100 + }, + { + "epoch": 1.3973121984838044, + "grad_norm": 28.7106990814209, + "learning_rate": 6.449444791200956e-08, + "logits/chosen": -2.449850559234619, + "logits/rejected": -2.4193978309631348, + "logps/chosen": -158.18594360351562, + "logps/rejected": -175.22109985351562, + "loss": 0.6249, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.0139230489730835, + "rewards/margins": 0.23840279877185822, + "rewards/rejected": -1.2523258924484253, + "step": 8110 + }, + { + "epoch": 1.3990351481736734, + "grad_norm": 27.975040435791016, + "learning_rate": 6.43984809538147e-08, + "logits/chosen": -2.441727876663208, + "logits/rejected": -2.4223945140838623, + "logps/chosen": -155.35897827148438, + "logps/rejected": -171.23580932617188, + "loss": 0.643, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.9953263401985168, + "rewards/margins": 0.19734260439872742, + "rewards/rejected": -1.1926690340042114, + "step": 8120 + }, + { + "epoch": 1.4007580978635423, + "grad_norm": 31.7548770904541, + "learning_rate": 6.430245612231501e-08, + "logits/chosen": -2.4581613540649414, + "logits/rejected": -2.4431262016296387, + "logps/chosen": -147.70947265625, + "logps/rejected": -166.51431274414062, + "loss": 0.6152, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9203088879585266, + "rewards/margins": 0.2477300465106964, + "rewards/rejected": -1.1680388450622559, + "step": 8130 + }, + { + "epoch": 1.4024810475534115, + "grad_norm": 30.289751052856445, + "learning_rate": 6.420637380347304e-08, + "logits/chosen": -2.43922758102417, + "logits/rejected": -2.4155101776123047, + "logps/chosen": -148.98468017578125, + "logps/rejected": -173.9259033203125, + "loss": 0.6175, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.952150821685791, + "rewards/margins": 0.26444974541664124, + "rewards/rejected": -1.2166005373001099, + "step": 8140 + }, + { + "epoch": 1.4042039972432805, + "grad_norm": 22.625505447387695, + "learning_rate": 6.41102343834824e-08, + "logits/chosen": -2.4767231941223145, + "logits/rejected": -2.456955671310425, + "logps/chosen": -148.2836151123047, + "logps/rejected": -172.3328857421875, + "loss": 0.6314, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.9487040638923645, + "rewards/margins": 0.2521008551120758, + "rewards/rejected": -1.2008049488067627, + "step": 8150 + }, + { + "epoch": 1.4059269469331497, + "grad_norm": 23.49427032470703, + "learning_rate": 6.40140382487662e-08, + "logits/chosen": -2.431381940841675, + "logits/rejected": -2.41220760345459, + "logps/chosen": -149.7254180908203, + "logps/rejected": -173.80136108398438, + "loss": 0.5963, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.9332088232040405, + "rewards/margins": 0.2952547073364258, + "rewards/rejected": -1.2284636497497559, + "step": 8160 + }, + { + "epoch": 1.4076498966230186, + "grad_norm": 28.60329246520996, + "learning_rate": 6.391778578597555e-08, + "logits/chosen": -2.480720043182373, + "logits/rejected": -2.4537298679351807, + "logps/chosen": -147.88601684570312, + "logps/rejected": -161.3191680908203, + "loss": 0.6234, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8782271146774292, + "rewards/margins": 0.223358154296875, + "rewards/rejected": -1.1015852689743042, + "step": 8170 + }, + { + "epoch": 1.4093728463128876, + "grad_norm": 24.527114868164062, + "learning_rate": 6.38214773819879e-08, + "logits/chosen": -2.4858410358428955, + "logits/rejected": -2.467200756072998, + "logps/chosen": -154.21408081054688, + "logps/rejected": -171.1106414794922, + "loss": 0.6389, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.9663408398628235, + "rewards/margins": 0.1989414542913437, + "rewards/rejected": -1.1652823686599731, + "step": 8180 + }, + { + "epoch": 1.4110957960027566, + "grad_norm": 28.459131240844727, + "learning_rate": 6.37251134239056e-08, + "logits/chosen": -2.4083938598632812, + "logits/rejected": -2.3872787952423096, + "logps/chosen": -158.9376220703125, + "logps/rejected": -175.55117797851562, + "loss": 0.6371, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.9954689741134644, + "rewards/margins": 0.22491955757141113, + "rewards/rejected": -1.2203885316848755, + "step": 8190 + }, + { + "epoch": 1.4128187456926258, + "grad_norm": 33.668880462646484, + "learning_rate": 6.362869429905431e-08, + "logits/chosen": -2.4564146995544434, + "logits/rejected": -2.435072422027588, + "logps/chosen": -157.72775268554688, + "logps/rejected": -171.80508422851562, + "loss": 0.6479, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.018909215927124, + "rewards/margins": 0.18646292388439178, + "rewards/rejected": -1.2053722143173218, + "step": 8200 + }, + { + "epoch": 1.414541695382495, + "grad_norm": 23.163822174072266, + "learning_rate": 6.353222039498136e-08, + "logits/chosen": -2.3753769397735596, + "logits/rejected": -2.3579654693603516, + "logps/chosen": -150.57046508789062, + "logps/rejected": -170.6091766357422, + "loss": 0.6411, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.9436851739883423, + "rewards/margins": 0.20695781707763672, + "rewards/rejected": -1.150642991065979, + "step": 8210 + }, + { + "epoch": 1.416264645072364, + "grad_norm": 25.262638092041016, + "learning_rate": 6.343569209945431e-08, + "logits/chosen": -2.460125684738159, + "logits/rejected": -2.4342360496520996, + "logps/chosen": -138.68099975585938, + "logps/rejected": -163.5703582763672, + "loss": 0.6106, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.85523521900177, + "rewards/margins": 0.2580004334449768, + "rewards/rejected": -1.1132357120513916, + "step": 8220 + }, + { + "epoch": 1.417987594762233, + "grad_norm": 25.81752586364746, + "learning_rate": 6.333910980045932e-08, + "logits/chosen": -2.445486307144165, + "logits/rejected": -2.4349093437194824, + "logps/chosen": -149.00405883789062, + "logps/rejected": -158.01571655273438, + "loss": 0.6639, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.9231271743774414, + "rewards/margins": 0.15333291888237, + "rewards/rejected": -1.0764600038528442, + "step": 8230 + }, + { + "epoch": 1.4197105444521019, + "grad_norm": 39.4451904296875, + "learning_rate": 6.324247388619967e-08, + "logits/chosen": -2.5211682319641113, + "logits/rejected": -2.495532512664795, + "logps/chosen": -144.02603149414062, + "logps/rejected": -161.476318359375, + "loss": 0.6187, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.8706077337265015, + "rewards/margins": 0.2339431345462799, + "rewards/rejected": -1.104550838470459, + "step": 8240 + }, + { + "epoch": 1.421433494141971, + "grad_norm": 25.24150276184082, + "learning_rate": 6.314578474509403e-08, + "logits/chosen": -2.475867748260498, + "logits/rejected": -2.4594616889953613, + "logps/chosen": -144.00283813476562, + "logps/rejected": -164.76052856445312, + "loss": 0.6127, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.8457595109939575, + "rewards/margins": 0.2605190873146057, + "rewards/rejected": -1.106278657913208, + "step": 8250 + }, + { + "epoch": 1.42315644383184, + "grad_norm": 19.108230590820312, + "learning_rate": 6.30490427657751e-08, + "logits/chosen": -2.5130741596221924, + "logits/rejected": -2.495227336883545, + "logps/chosen": -150.24356079101562, + "logps/rejected": -173.76339721679688, + "loss": 0.6116, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.9215757250785828, + "rewards/margins": 0.26414090394973755, + "rewards/rejected": -1.1857167482376099, + "step": 8260 + }, + { + "epoch": 1.4248793935217092, + "grad_norm": 26.77340316772461, + "learning_rate": 6.295224833708792e-08, + "logits/chosen": -2.50467586517334, + "logits/rejected": -2.493666887283325, + "logps/chosen": -150.25022888183594, + "logps/rejected": -172.16513061523438, + "loss": 0.6383, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.9621537923812866, + "rewards/margins": 0.21912629902362823, + "rewards/rejected": -1.1812803745269775, + "step": 8270 + }, + { + "epoch": 1.4266023432115782, + "grad_norm": 23.759750366210938, + "learning_rate": 6.285540184808836e-08, + "logits/chosen": -2.434004783630371, + "logits/rejected": -2.422375202178955, + "logps/chosen": -147.2050323486328, + "logps/rejected": -164.6346893310547, + "loss": 0.6462, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9418224096298218, + "rewards/margins": 0.1856471747159958, + "rewards/rejected": -1.127469539642334, + "step": 8280 + }, + { + "epoch": 1.4283252929014472, + "grad_norm": 22.19275665283203, + "learning_rate": 6.275850368804156e-08, + "logits/chosen": -2.4233717918395996, + "logits/rejected": -2.3867719173431396, + "logps/chosen": -143.0716094970703, + "logps/rejected": -151.45523071289062, + "loss": 0.6448, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.8239733576774597, + "rewards/margins": 0.15603697299957275, + "rewards/rejected": -0.9800102114677429, + "step": 8290 + }, + { + "epoch": 1.4300482425913164, + "grad_norm": 20.175020217895508, + "learning_rate": 6.26615542464203e-08, + "logits/chosen": -2.5537476539611816, + "logits/rejected": -2.5354702472686768, + "logps/chosen": -147.383056640625, + "logps/rejected": -162.55567932128906, + "loss": 0.6372, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.8805820345878601, + "rewards/margins": 0.21152591705322266, + "rewards/rejected": -1.0921080112457275, + "step": 8300 + }, + { + "epoch": 1.4317711922811853, + "grad_norm": 21.76934814453125, + "learning_rate": 6.256455391290352e-08, + "logits/chosen": -2.4161832332611084, + "logits/rejected": -2.391357183456421, + "logps/chosen": -136.17333984375, + "logps/rejected": -149.758544921875, + "loss": 0.6289, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.7870678305625916, + "rewards/margins": 0.20766834914684296, + "rewards/rejected": -0.9947363138198853, + "step": 8310 + }, + { + "epoch": 1.4334941419710545, + "grad_norm": 28.782739639282227, + "learning_rate": 6.246750307737468e-08, + "logits/chosen": -2.4312474727630615, + "logits/rejected": -2.4197757244110107, + "logps/chosen": -135.2398223876953, + "logps/rejected": -160.67611694335938, + "loss": 0.6091, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.81584632396698, + "rewards/margins": 0.25617408752441406, + "rewards/rejected": -1.0720202922821045, + "step": 8320 + }, + { + "epoch": 1.4352170916609235, + "grad_norm": 25.49635887145996, + "learning_rate": 6.237040212992028e-08, + "logits/chosen": -2.467252016067505, + "logits/rejected": -2.4579386711120605, + "logps/chosen": -139.45643615722656, + "logps/rejected": -163.50772094726562, + "loss": 0.6373, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.8992892503738403, + "rewards/margins": 0.2082190215587616, + "rewards/rejected": -1.1075081825256348, + "step": 8330 + }, + { + "epoch": 1.4369400413507925, + "grad_norm": 25.415372848510742, + "learning_rate": 6.227325146082817e-08, + "logits/chosen": -2.511993169784546, + "logits/rejected": -2.4966964721679688, + "logps/chosen": -140.6236114501953, + "logps/rejected": -160.0564422607422, + "loss": 0.6209, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.8478447794914246, + "rewards/margins": 0.23764362931251526, + "rewards/rejected": -1.0854883193969727, + "step": 8340 + }, + { + "epoch": 1.4386629910406616, + "grad_norm": 22.237586975097656, + "learning_rate": 6.217605146058612e-08, + "logits/chosen": -2.4000821113586426, + "logits/rejected": -2.3829004764556885, + "logps/chosen": -143.29495239257812, + "logps/rejected": -154.86904907226562, + "loss": 0.6592, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": -0.8857830762863159, + "rewards/margins": 0.14205804467201233, + "rewards/rejected": -1.0278412103652954, + "step": 8350 + }, + { + "epoch": 1.4403859407305306, + "grad_norm": 32.56266403198242, + "learning_rate": 6.207880251988014e-08, + "logits/chosen": -2.383216381072998, + "logits/rejected": -2.355720281600952, + "logps/chosen": -142.70553588867188, + "logps/rejected": -159.14981079101562, + "loss": 0.6316, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.8659551739692688, + "rewards/margins": 0.21056893467903137, + "rewards/rejected": -1.0765241384506226, + "step": 8360 + }, + { + "epoch": 1.4421088904203998, + "grad_norm": 30.852476119995117, + "learning_rate": 6.198150502959296e-08, + "logits/chosen": -2.438936710357666, + "logits/rejected": -2.4242122173309326, + "logps/chosen": -138.79904174804688, + "logps/rejected": -161.7018280029297, + "loss": 0.6295, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.867357075214386, + "rewards/margins": 0.2242121398448944, + "rewards/rejected": -1.0915693044662476, + "step": 8370 + }, + { + "epoch": 1.4438318401102688, + "grad_norm": 25.91733169555664, + "learning_rate": 6.188415938080246e-08, + "logits/chosen": -2.506133794784546, + "logits/rejected": -2.4882261753082275, + "logps/chosen": -135.34335327148438, + "logps/rejected": -158.02169799804688, + "loss": 0.626, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.8259119987487793, + "rewards/margins": 0.22101373970508575, + "rewards/rejected": -1.0469257831573486, + "step": 8380 + }, + { + "epoch": 1.4455547898001377, + "grad_norm": 23.551240921020508, + "learning_rate": 6.178676596478007e-08, + "logits/chosen": -2.497065782546997, + "logits/rejected": -2.4661946296691895, + "logps/chosen": -137.8984832763672, + "logps/rejected": -167.7418670654297, + "loss": 0.5869, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.8056305050849915, + "rewards/margins": 0.318527489900589, + "rewards/rejected": -1.1241579055786133, + "step": 8390 + }, + { + "epoch": 1.447277739490007, + "grad_norm": 35.88808822631836, + "learning_rate": 6.168932517298927e-08, + "logits/chosen": -2.4765398502349854, + "logits/rejected": -2.457763671875, + "logps/chosen": -136.0047607421875, + "logps/rejected": -160.081787109375, + "loss": 0.61, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.8506819009780884, + "rewards/margins": 0.24718789756298065, + "rewards/rejected": -1.097869873046875, + "step": 8400 + }, + { + "epoch": 1.447277739490007, + "eval_logits/chosen": -2.5268397331237793, + "eval_logits/rejected": -2.51981520652771, + "eval_logps/chosen": -132.1185302734375, + "eval_logps/rejected": -148.7821044921875, + "eval_loss": 0.6540122628211975, + "eval_rewards/accuracies": 0.5980483293533325, + "eval_rewards/chosen": -0.7310304641723633, + "eval_rewards/margins": 0.12929461896419525, + "eval_rewards/rejected": -0.8603251576423645, + "eval_runtime": 383.2338, + "eval_samples_per_second": 11.231, + "eval_steps_per_second": 1.404, + "step": 8400 + }, + { + "epoch": 1.449000689179876, + "grad_norm": 24.67258071899414, + "learning_rate": 6.159183739708386e-08, + "logits/chosen": -2.4517154693603516, + "logits/rejected": -2.423412561416626, + "logps/chosen": -147.011962890625, + "logps/rejected": -169.0835418701172, + "loss": 0.5903, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.8814491033554077, + "rewards/margins": 0.30713149905204773, + "rewards/rejected": -1.1885805130004883, + "step": 8410 + }, + { + "epoch": 1.450723638869745, + "grad_norm": 23.347091674804688, + "learning_rate": 6.149430302890658e-08, + "logits/chosen": -2.3571114540100098, + "logits/rejected": -2.3440260887145996, + "logps/chosen": -145.65292358398438, + "logps/rejected": -164.59690856933594, + "loss": 0.6285, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.9319342374801636, + "rewards/margins": 0.20812121033668518, + "rewards/rejected": -1.140055537223816, + "step": 8420 + }, + { + "epoch": 1.452446588559614, + "grad_norm": 24.38920021057129, + "learning_rate": 6.139672246048741e-08, + "logits/chosen": -2.43851900100708, + "logits/rejected": -2.4240710735321045, + "logps/chosen": -146.89157104492188, + "logps/rejected": -171.93997192382812, + "loss": 0.6267, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9251266717910767, + "rewards/margins": 0.2314465492963791, + "rewards/rejected": -1.1565730571746826, + "step": 8430 + }, + { + "epoch": 1.454169538249483, + "grad_norm": 28.91775894165039, + "learning_rate": 6.129909608404203e-08, + "logits/chosen": -2.4394712448120117, + "logits/rejected": -2.426893472671509, + "logps/chosen": -156.36746215820312, + "logps/rejected": -170.6226043701172, + "loss": 0.6387, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.0020545721054077, + "rewards/margins": 0.2015179693698883, + "rewards/rejected": -1.2035726308822632, + "step": 8440 + }, + { + "epoch": 1.4558924879393522, + "grad_norm": 34.99538040161133, + "learning_rate": 6.120142429197024e-08, + "logits/chosen": -2.3521504402160645, + "logits/rejected": -2.3453006744384766, + "logps/chosen": -151.5529327392578, + "logps/rejected": -178.8933563232422, + "loss": 0.62, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.0283129215240479, + "rewards/margins": 0.23831066489219666, + "rewards/rejected": -1.266623616218567, + "step": 8450 + }, + { + "epoch": 1.4576154376292212, + "grad_norm": 30.90590476989746, + "learning_rate": 6.110370747685437e-08, + "logits/chosen": -2.4394407272338867, + "logits/rejected": -2.4174625873565674, + "logps/chosen": -161.3643035888672, + "logps/rejected": -182.9066925048828, + "loss": 0.6383, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.0784192085266113, + "rewards/margins": 0.22087764739990234, + "rewards/rejected": -1.2992968559265137, + "step": 8460 + }, + { + "epoch": 1.4593383873190904, + "grad_norm": 27.465330123901367, + "learning_rate": 6.100594603145774e-08, + "logits/chosen": -2.446089267730713, + "logits/rejected": -2.4204440116882324, + "logps/chosen": -157.5586395263672, + "logps/rejected": -176.43116760253906, + "loss": 0.6287, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.0111454725265503, + "rewards/margins": 0.23245224356651306, + "rewards/rejected": -1.2435976266860962, + "step": 8470 + }, + { + "epoch": 1.4610613370089593, + "grad_norm": 33.98664474487305, + "learning_rate": 6.090814034872306e-08, + "logits/chosen": -2.4160518646240234, + "logits/rejected": -2.3934266567230225, + "logps/chosen": -155.5721435546875, + "logps/rejected": -176.04550170898438, + "loss": 0.6278, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.0139497518539429, + "rewards/margins": 0.23836950957775116, + "rewards/rejected": -1.2523192167282104, + "step": 8480 + }, + { + "epoch": 1.4627842866988283, + "grad_norm": 24.236244201660156, + "learning_rate": 6.08102908217708e-08, + "logits/chosen": -2.4765357971191406, + "logits/rejected": -2.4670848846435547, + "logps/chosen": -147.55117797851562, + "logps/rejected": -174.74295043945312, + "loss": 0.6063, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.9235634803771973, + "rewards/margins": 0.2622116208076477, + "rewards/rejected": -1.1857750415802002, + "step": 8490 + }, + { + "epoch": 1.4645072363886975, + "grad_norm": 36.001224517822266, + "learning_rate": 6.071239784389773e-08, + "logits/chosen": -2.3951425552368164, + "logits/rejected": -2.380840301513672, + "logps/chosen": -148.68881225585938, + "logps/rejected": -171.8776397705078, + "loss": 0.6088, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.9332743883132935, + "rewards/margins": 0.2613178789615631, + "rewards/rejected": -1.1945923566818237, + "step": 8500 + }, + { + "epoch": 1.4662301860785665, + "grad_norm": 30.147855758666992, + "learning_rate": 6.061446180857521e-08, + "logits/chosen": -2.414111852645874, + "logits/rejected": -2.3871891498565674, + "logps/chosen": -158.39291381835938, + "logps/rejected": -178.720947265625, + "loss": 0.6044, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.9931279420852661, + "rewards/margins": 0.28494516015052795, + "rewards/rejected": -1.2780730724334717, + "step": 8510 + }, + { + "epoch": 1.4679531357684357, + "grad_norm": 24.601787567138672, + "learning_rate": 6.051648310944766e-08, + "logits/chosen": -2.3954672813415527, + "logits/rejected": -2.3746702671051025, + "logps/chosen": -156.74166870117188, + "logps/rejected": -172.85977172851562, + "loss": 0.6243, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.0202082395553589, + "rewards/margins": 0.22919857501983643, + "rewards/rejected": -1.2494069337844849, + "step": 8520 + }, + { + "epoch": 1.4696760854583046, + "grad_norm": 26.90444564819336, + "learning_rate": 6.041846214033103e-08, + "logits/chosen": -2.386838912963867, + "logits/rejected": -2.369894504547119, + "logps/chosen": -155.7630157470703, + "logps/rejected": -168.96263122558594, + "loss": 0.6497, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.0040968656539917, + "rewards/margins": 0.18276354670524597, + "rewards/rejected": -1.18686044216156, + "step": 8530 + }, + { + "epoch": 1.4713990351481736, + "grad_norm": 25.653934478759766, + "learning_rate": 6.032039929521118e-08, + "logits/chosen": -2.5071861743927, + "logits/rejected": -2.493710994720459, + "logps/chosen": -149.86094665527344, + "logps/rejected": -161.53927612304688, + "loss": 0.6623, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.9416524171829224, + "rewards/margins": 0.13262325525283813, + "rewards/rejected": -1.0742757320404053, + "step": 8540 + }, + { + "epoch": 1.4731219848380428, + "grad_norm": 25.425033569335938, + "learning_rate": 6.02222949682422e-08, + "logits/chosen": -2.4378743171691895, + "logits/rejected": -2.4243133068084717, + "logps/chosen": -144.62643432617188, + "logps/rejected": -172.61434936523438, + "loss": 0.5975, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.8925802111625671, + "rewards/margins": 0.2852075695991516, + "rewards/rejected": -1.1777875423431396, + "step": 8550 + }, + { + "epoch": 1.4748449345279118, + "grad_norm": 30.290773391723633, + "learning_rate": 6.0124149553745e-08, + "logits/chosen": -2.5087571144104004, + "logits/rejected": -2.484095335006714, + "logps/chosen": -148.76278686523438, + "logps/rejected": -177.43338012695312, + "loss": 0.5976, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.9393836259841919, + "rewards/margins": 0.3176620304584503, + "rewards/rejected": -1.2570455074310303, + "step": 8560 + }, + { + "epoch": 1.476567884217781, + "grad_norm": 29.89841651916504, + "learning_rate": 6.002596344620556e-08, + "logits/chosen": -2.3959784507751465, + "logits/rejected": -2.3787484169006348, + "logps/chosen": -155.08749389648438, + "logps/rejected": -177.78707885742188, + "loss": 0.6126, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.0001827478408813, + "rewards/margins": 0.2656208276748657, + "rewards/rejected": -1.265803575515747, + "step": 8570 + }, + { + "epoch": 1.47829083390765, + "grad_norm": 24.696767807006836, + "learning_rate": 5.992773704027354e-08, + "logits/chosen": -2.453443765640259, + "logits/rejected": -2.430490493774414, + "logps/chosen": -158.04920959472656, + "logps/rejected": -188.0573272705078, + "loss": 0.5873, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.02683424949646, + "rewards/margins": 0.31399792432785034, + "rewards/rejected": -1.340832233428955, + "step": 8580 + }, + { + "epoch": 1.480013783597519, + "grad_norm": 28.04570198059082, + "learning_rate": 5.982947073076041e-08, + "logits/chosen": -2.449530839920044, + "logits/rejected": -2.4251859188079834, + "logps/chosen": -159.4496612548828, + "logps/rejected": -177.74026489257812, + "loss": 0.6136, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.008068323135376, + "rewards/margins": 0.24308781325817108, + "rewards/rejected": -1.251155972480774, + "step": 8590 + }, + { + "epoch": 1.481736733287388, + "grad_norm": 28.41834831237793, + "learning_rate": 5.973116491263818e-08, + "logits/chosen": -2.4075114727020264, + "logits/rejected": -2.3850674629211426, + "logps/chosen": -160.99905395507812, + "logps/rejected": -175.75933837890625, + "loss": 0.6547, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0687962770462036, + "rewards/margins": 0.17903414368629456, + "rewards/rejected": -1.2478303909301758, + "step": 8600 + }, + { + "epoch": 1.483459682977257, + "grad_norm": 33.26375961303711, + "learning_rate": 5.963281998103759e-08, + "logits/chosen": -2.420893907546997, + "logits/rejected": -2.4020278453826904, + "logps/chosen": -160.13504028320312, + "logps/rejected": -179.3316650390625, + "loss": 0.6322, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.0473644733428955, + "rewards/margins": 0.2434740513563156, + "rewards/rejected": -1.290838599205017, + "step": 8610 + }, + { + "epoch": 1.4851826326671262, + "grad_norm": 24.831239700317383, + "learning_rate": 5.953443633124658e-08, + "logits/chosen": -2.371798038482666, + "logits/rejected": -2.3636550903320312, + "logps/chosen": -154.66403198242188, + "logps/rejected": -166.26895141601562, + "loss": 0.6485, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.9878658056259155, + "rewards/margins": 0.17269843816757202, + "rewards/rejected": -1.1605643033981323, + "step": 8620 + }, + { + "epoch": 1.4869055823569952, + "grad_norm": 30.51129913330078, + "learning_rate": 5.9436014358708787e-08, + "logits/chosen": -2.361635208129883, + "logits/rejected": -2.3477437496185303, + "logps/chosen": -143.97293090820312, + "logps/rejected": -171.8150177001953, + "loss": 0.5939, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.9073087573051453, + "rewards/margins": 0.29389339685440063, + "rewards/rejected": -1.2012020349502563, + "step": 8630 + }, + { + "epoch": 1.4886285320468642, + "grad_norm": 25.27128028869629, + "learning_rate": 5.933755445902177e-08, + "logits/chosen": -2.4640278816223145, + "logits/rejected": -2.4400534629821777, + "logps/chosen": -155.98472595214844, + "logps/rejected": -171.5356903076172, + "loss": 0.6424, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.9910284876823425, + "rewards/margins": 0.20574815571308136, + "rewards/rejected": -1.1967766284942627, + "step": 8640 + }, + { + "epoch": 1.4903514817367332, + "grad_norm": 30.846521377563477, + "learning_rate": 5.9239057027935637e-08, + "logits/chosen": -2.397857666015625, + "logits/rejected": -2.375941753387451, + "logps/chosen": -155.69415283203125, + "logps/rejected": -176.6514434814453, + "loss": 0.624, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.9969285130500793, + "rewards/margins": 0.2363603562116623, + "rewards/rejected": -1.2332890033721924, + "step": 8650 + }, + { + "epoch": 1.4920744314266023, + "grad_norm": 40.82417297363281, + "learning_rate": 5.914052246135127e-08, + "logits/chosen": -2.400925874710083, + "logits/rejected": -2.382791757583618, + "logps/chosen": -153.85623168945312, + "logps/rejected": -180.74575805664062, + "loss": 0.6067, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.9923663139343262, + "rewards/margins": 0.30433157086372375, + "rewards/rejected": -1.2966978549957275, + "step": 8660 + }, + { + "epoch": 1.4937973811164715, + "grad_norm": 28.784740447998047, + "learning_rate": 5.904195115531892e-08, + "logits/chosen": -2.4620003700256348, + "logits/rejected": -2.441227436065674, + "logps/chosen": -160.8101806640625, + "logps/rejected": -186.73927307128906, + "loss": 0.6119, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.079491376876831, + "rewards/margins": 0.26982641220092773, + "rewards/rejected": -1.3493177890777588, + "step": 8670 + }, + { + "epoch": 1.4955203308063405, + "grad_norm": 33.21119689941406, + "learning_rate": 5.894334350603637e-08, + "logits/chosen": -2.3955516815185547, + "logits/rejected": -2.391489028930664, + "logps/chosen": -158.9721221923828, + "logps/rejected": -177.6050262451172, + "loss": 0.6511, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.0687752962112427, + "rewards/margins": 0.17915226519107819, + "rewards/rejected": -1.2479274272918701, + "step": 8680 + }, + { + "epoch": 1.4972432804962095, + "grad_norm": 26.478548049926758, + "learning_rate": 5.8844699909847576e-08, + "logits/chosen": -2.412294626235962, + "logits/rejected": -2.3877341747283936, + "logps/chosen": -159.53952026367188, + "logps/rejected": -169.1040802001953, + "loss": 0.6595, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.0341455936431885, + "rewards/margins": 0.18306097388267517, + "rewards/rejected": -1.2172067165374756, + "step": 8690 + }, + { + "epoch": 1.4989662301860784, + "grad_norm": 28.747560501098633, + "learning_rate": 5.8746020763240956e-08, + "logits/chosen": -2.4410312175750732, + "logits/rejected": -2.415844440460205, + "logps/chosen": -154.33767700195312, + "logps/rejected": -167.69979858398438, + "loss": 0.6399, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9719335436820984, + "rewards/margins": 0.1852661371231079, + "rewards/rejected": -1.1571996212005615, + "step": 8700 + }, + { + "epoch": 1.5006891798759476, + "grad_norm": 23.79633331298828, + "learning_rate": 5.8647306462847814e-08, + "logits/chosen": -2.3959717750549316, + "logits/rejected": -2.377711772918701, + "logps/chosen": -151.95407104492188, + "logps/rejected": -167.41001892089844, + "loss": 0.6573, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.9795981645584106, + "rewards/margins": 0.16346515715122223, + "rewards/rejected": -1.1430633068084717, + "step": 8710 + }, + { + "epoch": 1.5024121295658168, + "grad_norm": 25.051179885864258, + "learning_rate": 5.854855740544078e-08, + "logits/chosen": -2.4349405765533447, + "logits/rejected": -2.408961772918701, + "logps/chosen": -142.64926147460938, + "logps/rejected": -165.88307189941406, + "loss": 0.6049, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.8556321859359741, + "rewards/margins": 0.26391908526420593, + "rewards/rejected": -1.1195513010025024, + "step": 8720 + }, + { + "epoch": 1.5041350792556858, + "grad_norm": 22.280866622924805, + "learning_rate": 5.844977398793211e-08, + "logits/chosen": -2.453397274017334, + "logits/rejected": -2.421761989593506, + "logps/chosen": -145.08370971679688, + "logps/rejected": -171.266845703125, + "loss": 0.6036, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.9160129427909851, + "rewards/margins": 0.28230738639831543, + "rewards/rejected": -1.1983201503753662, + "step": 8730 + }, + { + "epoch": 1.5058580289455548, + "grad_norm": 29.50322151184082, + "learning_rate": 5.8350956607372284e-08, + "logits/chosen": -2.4410319328308105, + "logits/rejected": -2.433777093887329, + "logps/chosen": -153.44320678710938, + "logps/rejected": -175.6660614013672, + "loss": 0.6224, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.0005269050598145, + "rewards/margins": 0.23446373641490936, + "rewards/rejected": -1.2349905967712402, + "step": 8740 + }, + { + "epoch": 1.5075809786354237, + "grad_norm": 27.271137237548828, + "learning_rate": 5.825210566094817e-08, + "logits/chosen": -2.4460768699645996, + "logits/rejected": -2.424647808074951, + "logps/chosen": -148.30789184570312, + "logps/rejected": -178.0007781982422, + "loss": 0.5884, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.9374672770500183, + "rewards/margins": 0.32129615545272827, + "rewards/rejected": -1.2587635517120361, + "step": 8750 + }, + { + "epoch": 1.509303928325293, + "grad_norm": 23.55537986755371, + "learning_rate": 5.8153221545981634e-08, + "logits/chosen": -2.4017863273620605, + "logits/rejected": -2.393065929412842, + "logps/chosen": -151.10546875, + "logps/rejected": -180.36007690429688, + "loss": 0.6065, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.9841243624687195, + "rewards/margins": 0.27674761414527893, + "rewards/rejected": -1.2608718872070312, + "step": 8760 + }, + { + "epoch": 1.5110268780151621, + "grad_norm": 24.040985107421875, + "learning_rate": 5.805430465992783e-08, + "logits/chosen": -2.4128012657165527, + "logits/rejected": -2.393354892730713, + "logps/chosen": -163.99383544921875, + "logps/rejected": -184.14706420898438, + "loss": 0.618, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.081097960472107, + "rewards/margins": 0.2662922739982605, + "rewards/rejected": -1.3473902940750122, + "step": 8770 + }, + { + "epoch": 1.512749827705031, + "grad_norm": 30.036476135253906, + "learning_rate": 5.795535540037364e-08, + "logits/chosen": -2.4553608894348145, + "logits/rejected": -2.4505133628845215, + "logps/chosen": -163.72341918945312, + "logps/rejected": -187.991943359375, + "loss": 0.6352, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.1354390382766724, + "rewards/margins": 0.2194855660200119, + "rewards/rejected": -1.3549244403839111, + "step": 8780 + }, + { + "epoch": 1.5144727773949, + "grad_norm": 39.66596221923828, + "learning_rate": 5.785637416503607e-08, + "logits/chosen": -2.444953680038452, + "logits/rejected": -2.420752763748169, + "logps/chosen": -165.5264434814453, + "logps/rejected": -186.53871154785156, + "loss": 0.6197, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.0931392908096313, + "rewards/margins": 0.26103657484054565, + "rewards/rejected": -1.3541758060455322, + "step": 8790 + }, + { + "epoch": 1.516195727084769, + "grad_norm": 29.25849151611328, + "learning_rate": 5.7757361351760625e-08, + "logits/chosen": -2.4005560874938965, + "logits/rejected": -2.3760859966278076, + "logps/chosen": -162.47555541992188, + "logps/rejected": -174.39950561523438, + "loss": 0.6575, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.061780571937561, + "rewards/margins": 0.16641394793987274, + "rewards/rejected": -1.2281947135925293, + "step": 8800 + }, + { + "epoch": 1.516195727084769, + "eval_logits/chosen": -2.5021514892578125, + "eval_logits/rejected": -2.4947457313537598, + "eval_logps/chosen": -142.70245361328125, + "eval_logps/rejected": -160.39002990722656, + "eval_loss": 0.6526638269424438, + "eval_rewards/accuracies": 0.5996747016906738, + "eval_rewards/chosen": -0.8368697166442871, + "eval_rewards/margins": 0.1395346075296402, + "eval_rewards/rejected": -0.9764042496681213, + "eval_runtime": 383.021, + "eval_samples_per_second": 11.237, + "eval_steps_per_second": 1.405, + "step": 8800 + }, + { + "epoch": 1.5179186767746382, + "grad_norm": 36.97319412231445, + "learning_rate": 5.765831735851978e-08, + "logits/chosen": -2.452981472015381, + "logits/rejected": -2.4250099658966064, + "logps/chosen": -153.0067596435547, + "logps/rejected": -181.1830291748047, + "loss": 0.6056, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.0083194971084595, + "rewards/margins": 0.28701165318489075, + "rewards/rejected": -1.2953310012817383, + "step": 8810 + }, + { + "epoch": 1.5196416264645074, + "grad_norm": 29.794078826904297, + "learning_rate": 5.7559242583411284e-08, + "logits/chosen": -2.4661405086517334, + "logits/rejected": -2.444133996963501, + "logps/chosen": -151.85812377929688, + "logps/rejected": -175.8976593017578, + "loss": 0.6124, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.002595067024231, + "rewards/margins": 0.26715952157974243, + "rewards/rejected": -1.2697546482086182, + "step": 8820 + }, + { + "epoch": 1.5213645761543764, + "grad_norm": 31.617746353149414, + "learning_rate": 5.746013742465665e-08, + "logits/chosen": -2.327963352203369, + "logits/rejected": -2.3066611289978027, + "logps/chosen": -159.5944061279297, + "logps/rejected": -183.40618896484375, + "loss": 0.6145, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.0419352054595947, + "rewards/margins": 0.2699545621871948, + "rewards/rejected": -1.3118897676467896, + "step": 8830 + }, + { + "epoch": 1.5230875258442453, + "grad_norm": 26.673795700073242, + "learning_rate": 5.7361002280599503e-08, + "logits/chosen": -2.377643585205078, + "logits/rejected": -2.3630967140197754, + "logps/chosen": -148.97561645507812, + "logps/rejected": -180.88720703125, + "loss": 0.5877, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.9541751146316528, + "rewards/margins": 0.32628554105758667, + "rewards/rejected": -1.2804607152938843, + "step": 8840 + }, + { + "epoch": 1.5248104755341143, + "grad_norm": 42.92271041870117, + "learning_rate": 5.726183754970397e-08, + "logits/chosen": -2.470742702484131, + "logits/rejected": -2.450875759124756, + "logps/chosen": -151.8878936767578, + "logps/rejected": -185.0004119873047, + "loss": 0.5973, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.9842556715011597, + "rewards/margins": 0.3271600902080536, + "rewards/rejected": -1.3114157915115356, + "step": 8850 + }, + { + "epoch": 1.5265334252239835, + "grad_norm": 32.4549446105957, + "learning_rate": 5.716264363055314e-08, + "logits/chosen": -2.3918213844299316, + "logits/rejected": -2.3717854022979736, + "logps/chosen": -167.3634490966797, + "logps/rejected": -192.9376678466797, + "loss": 0.6065, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.1311490535736084, + "rewards/margins": 0.2796618938446045, + "rewards/rejected": -1.4108108282089233, + "step": 8860 + }, + { + "epoch": 1.5282563749138525, + "grad_norm": 33.520606994628906, + "learning_rate": 5.706342092184739e-08, + "logits/chosen": -2.502729892730713, + "logits/rejected": -2.4715514183044434, + "logps/chosen": -169.19027709960938, + "logps/rejected": -197.01089477539062, + "loss": 0.5957, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.112261414527893, + "rewards/margins": 0.3132651150226593, + "rewards/rejected": -1.42552649974823, + "step": 8870 + }, + { + "epoch": 1.5299793246037217, + "grad_norm": 40.5619010925293, + "learning_rate": 5.696416982240282e-08, + "logits/chosen": -2.336522102355957, + "logits/rejected": -2.3161420822143555, + "logps/chosen": -182.067626953125, + "logps/rejected": -199.307861328125, + "loss": 0.6547, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.2775065898895264, + "rewards/margins": 0.2100718468427658, + "rewards/rejected": -1.4875786304473877, + "step": 8880 + }, + { + "epoch": 1.5317022742935906, + "grad_norm": 48.231964111328125, + "learning_rate": 5.686489073114965e-08, + "logits/chosen": -2.3405094146728516, + "logits/rejected": -2.315372943878174, + "logps/chosen": -174.9233856201172, + "logps/rejected": -196.65234375, + "loss": 0.6183, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.1903566122055054, + "rewards/margins": 0.2761090397834778, + "rewards/rejected": -1.4664658308029175, + "step": 8890 + }, + { + "epoch": 1.5334252239834596, + "grad_norm": 35.84044647216797, + "learning_rate": 5.676558404713061e-08, + "logits/chosen": -2.4168288707733154, + "logits/rejected": -2.3919520378112793, + "logps/chosen": -173.74485778808594, + "logps/rejected": -192.24745178222656, + "loss": 0.6406, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.1773998737335205, + "rewards/margins": 0.2269444763660431, + "rewards/rejected": -1.4043442010879517, + "step": 8900 + }, + { + "epoch": 1.5351481736733288, + "grad_norm": 30.220081329345703, + "learning_rate": 5.666625016949933e-08, + "logits/chosen": -2.4153027534484863, + "logits/rejected": -2.4012513160705566, + "logps/chosen": -168.22994995117188, + "logps/rejected": -187.8097686767578, + "loss": 0.6312, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.1117278337478638, + "rewards/margins": 0.2432313859462738, + "rewards/rejected": -1.35495924949646, + "step": 8910 + }, + { + "epoch": 1.5368711233631978, + "grad_norm": 28.499393463134766, + "learning_rate": 5.656688949751875e-08, + "logits/chosen": -2.4883599281311035, + "logits/rejected": -2.4606895446777344, + "logps/chosen": -161.80191040039062, + "logps/rejected": -186.3115692138672, + "loss": 0.5991, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.0300297737121582, + "rewards/margins": 0.3197111189365387, + "rewards/rejected": -1.349740982055664, + "step": 8920 + }, + { + "epoch": 1.538594073053067, + "grad_norm": 25.9266414642334, + "learning_rate": 5.64675024305595e-08, + "logits/chosen": -2.4324049949645996, + "logits/rejected": -2.4044463634490967, + "logps/chosen": -152.42242431640625, + "logps/rejected": -172.3201904296875, + "loss": 0.6181, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.9726540446281433, + "rewards/margins": 0.25222453474998474, + "rewards/rejected": -1.2248785495758057, + "step": 8930 + }, + { + "epoch": 1.540317022742936, + "grad_norm": 25.37604331970215, + "learning_rate": 5.6368089368098315e-08, + "logits/chosen": -2.440627098083496, + "logits/rejected": -2.4221489429473877, + "logps/chosen": -148.80087280273438, + "logps/rejected": -168.09170532226562, + "loss": 0.6287, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.9170831441879272, + "rewards/margins": 0.21031467616558075, + "rewards/rejected": -1.1273977756500244, + "step": 8940 + }, + { + "epoch": 1.5420399724328049, + "grad_norm": 28.829172134399414, + "learning_rate": 5.626865070971638e-08, + "logits/chosen": -2.376267910003662, + "logits/rejected": -2.3775124549865723, + "logps/chosen": -148.6018829345703, + "logps/rejected": -168.5276336669922, + "loss": 0.6395, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.9364569783210754, + "rewards/margins": 0.1876915991306305, + "rewards/rejected": -1.1241486072540283, + "step": 8950 + }, + { + "epoch": 1.5437629221226739, + "grad_norm": 44.60697555541992, + "learning_rate": 5.616918685509783e-08, + "logits/chosen": -2.4391655921936035, + "logits/rejected": -2.409994125366211, + "logps/chosen": -166.36790466308594, + "logps/rejected": -190.92630004882812, + "loss": 0.6093, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.1098852157592773, + "rewards/margins": 0.2932312488555908, + "rewards/rejected": -1.4031165838241577, + "step": 8960 + }, + { + "epoch": 1.545485871812543, + "grad_norm": 42.026702880859375, + "learning_rate": 5.606969820402797e-08, + "logits/chosen": -2.4033069610595703, + "logits/rejected": -2.3744869232177734, + "logps/chosen": -165.89268493652344, + "logps/rejected": -186.732177734375, + "loss": 0.6134, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1160341501235962, + "rewards/margins": 0.2519269287586212, + "rewards/rejected": -1.3679611682891846, + "step": 8970 + }, + { + "epoch": 1.5472088215024122, + "grad_norm": 30.750925064086914, + "learning_rate": 5.597018515639189e-08, + "logits/chosen": -2.467057943344116, + "logits/rejected": -2.4468226432800293, + "logps/chosen": -165.82440185546875, + "logps/rejected": -179.36331176757812, + "loss": 0.6693, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.1067142486572266, + "rewards/margins": 0.16217520833015442, + "rewards/rejected": -1.2688895463943481, + "step": 8980 + }, + { + "epoch": 1.5489317711922812, + "grad_norm": 28.24340057373047, + "learning_rate": 5.587064811217266e-08, + "logits/chosen": -2.3896968364715576, + "logits/rejected": -2.3712821006774902, + "logps/chosen": -153.08383178710938, + "logps/rejected": -173.04818725585938, + "loss": 0.616, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.9606486558914185, + "rewards/margins": 0.23374095559120178, + "rewards/rejected": -1.1943897008895874, + "step": 8990 + }, + { + "epoch": 1.5506547208821502, + "grad_norm": 34.09727096557617, + "learning_rate": 5.577108747144983e-08, + "logits/chosen": -2.448643922805786, + "logits/rejected": -2.426971197128296, + "logps/chosen": -164.97779846191406, + "logps/rejected": -180.38192749023438, + "loss": 0.6403, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.0645620822906494, + "rewards/margins": 0.22409923374652863, + "rewards/rejected": -1.288661241531372, + "step": 9000 + }, + { + "epoch": 1.5523776705720191, + "grad_norm": 20.49359893798828, + "learning_rate": 5.567150363439779e-08, + "logits/chosen": -2.418567657470703, + "logits/rejected": -2.398942708969116, + "logps/chosen": -156.52621459960938, + "logps/rejected": -174.5842742919922, + "loss": 0.6264, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.9978251457214355, + "rewards/margins": 0.2272067368030548, + "rewards/rejected": -1.225031852722168, + "step": 9010 + }, + { + "epoch": 1.5541006202618883, + "grad_norm": 32.422637939453125, + "learning_rate": 5.557189700128414e-08, + "logits/chosen": -2.356786012649536, + "logits/rejected": -2.342362880706787, + "logps/chosen": -153.6685791015625, + "logps/rejected": -175.55043029785156, + "loss": 0.6103, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9793645739555359, + "rewards/margins": 0.2766626179218292, + "rewards/rejected": -1.256027340888977, + "step": 9020 + }, + { + "epoch": 1.5558235699517575, + "grad_norm": 34.36164855957031, + "learning_rate": 5.547226797246817e-08, + "logits/chosen": -2.3909685611724854, + "logits/rejected": -2.3950893878936768, + "logps/chosen": -148.6129913330078, + "logps/rejected": -172.05209350585938, + "loss": 0.6279, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.9618427157402039, + "rewards/margins": 0.23005004227161407, + "rewards/rejected": -1.1918928623199463, + "step": 9030 + }, + { + "epoch": 1.5575465196416265, + "grad_norm": 25.065876007080078, + "learning_rate": 5.53726169483991e-08, + "logits/chosen": -2.3976728916168213, + "logits/rejected": -2.382448673248291, + "logps/chosen": -152.49664306640625, + "logps/rejected": -173.43907165527344, + "loss": 0.6379, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.011561632156372, + "rewards/margins": 0.20885252952575684, + "rewards/rejected": -1.2204139232635498, + "step": 9040 + }, + { + "epoch": 1.5592694693314955, + "grad_norm": 29.87236785888672, + "learning_rate": 5.5272944329614656e-08, + "logits/chosen": -2.448866367340088, + "logits/rejected": -2.4280974864959717, + "logps/chosen": -157.89830017089844, + "logps/rejected": -178.96958923339844, + "loss": 0.6355, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.0388907194137573, + "rewards/margins": 0.2381664514541626, + "rewards/rejected": -1.27705717086792, + "step": 9050 + }, + { + "epoch": 1.5609924190213644, + "grad_norm": 28.082740783691406, + "learning_rate": 5.517325051673928e-08, + "logits/chosen": -2.4414734840393066, + "logits/rejected": -2.423208713531494, + "logps/chosen": -156.8459014892578, + "logps/rejected": -171.1651153564453, + "loss": 0.6481, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.0130650997161865, + "rewards/margins": 0.17299702763557434, + "rewards/rejected": -1.186062216758728, + "step": 9060 + }, + { + "epoch": 1.5627153687112336, + "grad_norm": 30.63709259033203, + "learning_rate": 5.5073535910482625e-08, + "logits/chosen": -2.4263854026794434, + "logits/rejected": -2.407336711883545, + "logps/chosen": -144.11001586914062, + "logps/rejected": -172.8216094970703, + "loss": 0.5891, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.8767523765563965, + "rewards/margins": 0.31083789467811584, + "rewards/rejected": -1.18759024143219, + "step": 9070 + }, + { + "epoch": 1.5644383184011028, + "grad_norm": 28.939895629882812, + "learning_rate": 5.4973800911637966e-08, + "logits/chosen": -2.4199304580688477, + "logits/rejected": -2.4097867012023926, + "logps/chosen": -144.50460815429688, + "logps/rejected": -168.3485565185547, + "loss": 0.6292, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9143769145011902, + "rewards/margins": 0.2185221016407013, + "rewards/rejected": -1.1328990459442139, + "step": 9080 + }, + { + "epoch": 1.5661612680909718, + "grad_norm": 26.135000228881836, + "learning_rate": 5.487404592108047e-08, + "logits/chosen": -2.395542621612549, + "logits/rejected": -2.3650925159454346, + "logps/chosen": -153.5354461669922, + "logps/rejected": -171.31285095214844, + "loss": 0.6163, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9741457104682922, + "rewards/margins": 0.23896925151348114, + "rewards/rejected": -1.2131149768829346, + "step": 9090 + }, + { + "epoch": 1.5678842177808407, + "grad_norm": 21.869953155517578, + "learning_rate": 5.477427133976573e-08, + "logits/chosen": -2.4462099075317383, + "logits/rejected": -2.4201271533966064, + "logps/chosen": -160.56097412109375, + "logps/rejected": -169.7161865234375, + "loss": 0.665, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.0276155471801758, + "rewards/margins": 0.1408327966928482, + "rewards/rejected": -1.1684482097625732, + "step": 9100 + }, + { + "epoch": 1.5696071674707097, + "grad_norm": 23.752208709716797, + "learning_rate": 5.467447756872802e-08, + "logits/chosen": -2.414022922515869, + "logits/rejected": -2.388249158859253, + "logps/chosen": -151.4723663330078, + "logps/rejected": -174.31192016601562, + "loss": 0.6124, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.947963535785675, + "rewards/margins": 0.27398091554641724, + "rewards/rejected": -1.2219444513320923, + "step": 9110 + }, + { + "epoch": 1.571330117160579, + "grad_norm": 24.905351638793945, + "learning_rate": 5.457466500907877e-08, + "logits/chosen": -2.452876091003418, + "logits/rejected": -2.429744005203247, + "logps/chosen": -157.91603088378906, + "logps/rejected": -170.0849609375, + "loss": 0.6383, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.9618592262268066, + "rewards/margins": 0.18914249539375305, + "rewards/rejected": -1.1510016918182373, + "step": 9120 + }, + { + "epoch": 1.573053066850448, + "grad_norm": 26.952442169189453, + "learning_rate": 5.447483406200496e-08, + "logits/chosen": -2.4045395851135254, + "logits/rejected": -2.3840603828430176, + "logps/chosen": -156.55654907226562, + "logps/rejected": -176.99368286132812, + "loss": 0.6282, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.0117504596710205, + "rewards/margins": 0.2167857438325882, + "rewards/rejected": -1.2285362482070923, + "step": 9130 + }, + { + "epoch": 1.574776016540317, + "grad_norm": 26.6584529876709, + "learning_rate": 5.437498512876741e-08, + "logits/chosen": -2.44810152053833, + "logits/rejected": -2.4075706005096436, + "logps/chosen": -158.2299041748047, + "logps/rejected": -178.40318298339844, + "loss": 0.5943, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.9947940111160278, + "rewards/margins": 0.30434533953666687, + "rewards/rejected": -1.2991392612457275, + "step": 9140 + }, + { + "epoch": 1.576498966230186, + "grad_norm": 35.36668395996094, + "learning_rate": 5.427511861069932e-08, + "logits/chosen": -2.4356255531311035, + "logits/rejected": -2.407078266143799, + "logps/chosen": -166.36285400390625, + "logps/rejected": -191.15957641601562, + "loss": 0.6044, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0679361820220947, + "rewards/margins": 0.30740636587142944, + "rewards/rejected": -1.375342607498169, + "step": 9150 + }, + { + "epoch": 1.578221915920055, + "grad_norm": 30.753293991088867, + "learning_rate": 5.417523490920448e-08, + "logits/chosen": -2.404432773590088, + "logits/rejected": -2.399017095565796, + "logps/chosen": -153.8131103515625, + "logps/rejected": -181.61129760742188, + "loss": 0.6141, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.0386629104614258, + "rewards/margins": 0.2596225142478943, + "rewards/rejected": -1.2982854843139648, + "step": 9160 + }, + { + "epoch": 1.5799448656099242, + "grad_norm": 24.764421463012695, + "learning_rate": 5.4075334425755824e-08, + "logits/chosen": -2.4555180072784424, + "logits/rejected": -2.4226462841033936, + "logps/chosen": -161.15907287597656, + "logps/rejected": -186.55917358398438, + "loss": 0.5987, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.055022954940796, + "rewards/margins": 0.32597872614860535, + "rewards/rejected": -1.3810017108917236, + "step": 9170 + }, + { + "epoch": 1.5816678152997934, + "grad_norm": 23.54519271850586, + "learning_rate": 5.397541756189369e-08, + "logits/chosen": -2.4232265949249268, + "logits/rejected": -2.4142425060272217, + "logps/chosen": -166.00881958007812, + "logps/rejected": -180.232177734375, + "loss": 0.6474, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.0988941192626953, + "rewards/margins": 0.18343313038349152, + "rewards/rejected": -1.2823272943496704, + "step": 9180 + }, + { + "epoch": 1.5833907649896624, + "grad_norm": 22.17339515686035, + "learning_rate": 5.387548471922425e-08, + "logits/chosen": -2.509873628616333, + "logits/rejected": -2.5046327114105225, + "logps/chosen": -158.27267456054688, + "logps/rejected": -186.60572814941406, + "loss": 0.5993, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0305545330047607, + "rewards/margins": 0.27656883001327515, + "rewards/rejected": -1.3071234226226807, + "step": 9190 + }, + { + "epoch": 1.5851137146795313, + "grad_norm": 32.23719024658203, + "learning_rate": 5.3775536299417957e-08, + "logits/chosen": -2.4358341693878174, + "logits/rejected": -2.418269395828247, + "logps/chosen": -163.1688995361328, + "logps/rejected": -189.51535034179688, + "loss": 0.5969, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0882149934768677, + "rewards/margins": 0.301506370306015, + "rewards/rejected": -1.389721393585205, + "step": 9200 + }, + { + "epoch": 1.5851137146795313, + "eval_logits/chosen": -2.4746241569519043, + "eval_logits/rejected": -2.466055154800415, + "eval_logps/chosen": -148.2314910888672, + "eval_logps/rejected": -166.408935546875, + "eval_loss": 0.6516256332397461, + "eval_rewards/accuracies": 0.6101301312446594, + "eval_rewards/chosen": -0.8921600580215454, + "eval_rewards/margins": 0.14443333446979523, + "eval_rewards/rejected": -1.0365933179855347, + "eval_runtime": 382.9379, + "eval_samples_per_second": 11.239, + "eval_steps_per_second": 1.405, + "step": 9200 + }, + { + "epoch": 1.5868366643694003, + "grad_norm": 39.60490036010742, + "learning_rate": 5.3675572704207826e-08, + "logits/chosen": -2.3610215187072754, + "logits/rejected": -2.3336758613586426, + "logps/chosen": -170.14991760253906, + "logps/rejected": -187.44143676757812, + "loss": 0.6331, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1286404132843018, + "rewards/margins": 0.23823793232440948, + "rewards/rejected": -1.3668782711029053, + "step": 9210 + }, + { + "epoch": 1.5885596140592695, + "grad_norm": 27.61195182800293, + "learning_rate": 5.3575594335387876e-08, + "logits/chosen": -2.404003381729126, + "logits/rejected": -2.3874430656433105, + "logps/chosen": -159.68882751464844, + "logps/rejected": -183.86068725585938, + "loss": 0.6165, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.0844800472259521, + "rewards/margins": 0.26251906156539917, + "rewards/rejected": -1.346998929977417, + "step": 9220 + }, + { + "epoch": 1.5902825637491387, + "grad_norm": 27.68160629272461, + "learning_rate": 5.347560159481153e-08, + "logits/chosen": -2.3483078479766846, + "logits/rejected": -2.3337655067443848, + "logps/chosen": -156.78604125976562, + "logps/rejected": -187.99929809570312, + "loss": 0.5975, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.0658495426177979, + "rewards/margins": 0.28840649127960205, + "rewards/rejected": -1.3542559146881104, + "step": 9230 + }, + { + "epoch": 1.5920055134390076, + "grad_norm": 33.5504035949707, + "learning_rate": 5.337559488438994e-08, + "logits/chosen": -2.3742103576660156, + "logits/rejected": -2.364173173904419, + "logps/chosen": -171.8185272216797, + "logps/rejected": -199.3497314453125, + "loss": 0.6049, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.143182635307312, + "rewards/margins": 0.3130360543727875, + "rewards/rejected": -1.4562186002731323, + "step": 9240 + }, + { + "epoch": 1.5937284631288766, + "grad_norm": 24.97473907470703, + "learning_rate": 5.327557460609043e-08, + "logits/chosen": -2.362847328186035, + "logits/rejected": -2.339869499206543, + "logps/chosen": -164.04771423339844, + "logps/rejected": -186.96725463867188, + "loss": 0.627, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.077368974685669, + "rewards/margins": 0.2501301169395447, + "rewards/rejected": -1.3274990320205688, + "step": 9250 + }, + { + "epoch": 1.5954514128187456, + "grad_norm": 27.14133071899414, + "learning_rate": 5.317554116193488e-08, + "logits/chosen": -2.3598151206970215, + "logits/rejected": -2.3444392681121826, + "logps/chosen": -168.51766967773438, + "logps/rejected": -191.16806030273438, + "loss": 0.6361, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.1996451616287231, + "rewards/margins": 0.23302045464515686, + "rewards/rejected": -1.4326655864715576, + "step": 9260 + }, + { + "epoch": 1.5971743625086148, + "grad_norm": 24.20354652404785, + "learning_rate": 5.307549495399804e-08, + "logits/chosen": -2.428321123123169, + "logits/rejected": -2.4023356437683105, + "logps/chosen": -171.56423950195312, + "logps/rejected": -188.52224731445312, + "loss": 0.6295, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.1535447835922241, + "rewards/margins": 0.22041280567646027, + "rewards/rejected": -1.373957633972168, + "step": 9270 + }, + { + "epoch": 1.598897312198484, + "grad_norm": 37.25349807739258, + "learning_rate": 5.2975436384406e-08, + "logits/chosen": -2.421985387802124, + "logits/rejected": -2.403449296951294, + "logps/chosen": -157.58123779296875, + "logps/rejected": -184.67398071289062, + "loss": 0.6101, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.0259568691253662, + "rewards/margins": 0.2917240858078003, + "rewards/rejected": -1.3176809549331665, + "step": 9280 + }, + { + "epoch": 1.600620261888353, + "grad_norm": 25.58997917175293, + "learning_rate": 5.287536585533453e-08, + "logits/chosen": -2.3553547859191895, + "logits/rejected": -2.3330671787261963, + "logps/chosen": -152.5822296142578, + "logps/rejected": -170.2565155029297, + "loss": 0.627, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.9716187715530396, + "rewards/margins": 0.23068185150623322, + "rewards/rejected": -1.2023006677627563, + "step": 9290 + }, + { + "epoch": 1.602343211578222, + "grad_norm": 34.46986770629883, + "learning_rate": 5.2775283769007464e-08, + "logits/chosen": -2.402198314666748, + "logits/rejected": -2.39088773727417, + "logps/chosen": -156.928955078125, + "logps/rejected": -186.67454528808594, + "loss": 0.6059, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.019683599472046, + "rewards/margins": 0.2989785075187683, + "rewards/rejected": -1.318662166595459, + "step": 9300 + }, + { + "epoch": 1.6040661612680909, + "grad_norm": 30.7072696685791, + "learning_rate": 5.267519052769507e-08, + "logits/chosen": -2.3956387042999268, + "logits/rejected": -2.3802638053894043, + "logps/chosen": -164.15245056152344, + "logps/rejected": -180.8313446044922, + "loss": 0.6306, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.0937596559524536, + "rewards/margins": 0.21840448677539825, + "rewards/rejected": -1.312164068222046, + "step": 9310 + }, + { + "epoch": 1.60578911095796, + "grad_norm": 27.55194664001465, + "learning_rate": 5.257508653371252e-08, + "logits/chosen": -2.456815242767334, + "logits/rejected": -2.4346182346343994, + "logps/chosen": -155.98793029785156, + "logps/rejected": -186.184326171875, + "loss": 0.6075, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.0377084016799927, + "rewards/margins": 0.29293209314346313, + "rewards/rejected": -1.330640435218811, + "step": 9320 + }, + { + "epoch": 1.607512060647829, + "grad_norm": 31.598167419433594, + "learning_rate": 5.2474972189418096e-08, + "logits/chosen": -2.4160571098327637, + "logits/rejected": -2.3925347328186035, + "logps/chosen": -164.6399688720703, + "logps/rejected": -189.77328491210938, + "loss": 0.6075, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.061995267868042, + "rewards/margins": 0.2920532822608948, + "rewards/rejected": -1.354048490524292, + "step": 9330 + }, + { + "epoch": 1.6092350103376982, + "grad_norm": 25.802043914794922, + "learning_rate": 5.237484789721178e-08, + "logits/chosen": -2.3657262325286865, + "logits/rejected": -2.349848747253418, + "logps/chosen": -158.73101806640625, + "logps/rejected": -189.5824737548828, + "loss": 0.5898, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0642492771148682, + "rewards/margins": 0.327562153339386, + "rewards/rejected": -1.3918113708496094, + "step": 9340 + }, + { + "epoch": 1.6109579600275672, + "grad_norm": 44.81790542602539, + "learning_rate": 5.227471405953352e-08, + "logits/chosen": -2.3814568519592285, + "logits/rejected": -2.3522324562072754, + "logps/chosen": -156.79432678222656, + "logps/rejected": -183.58204650878906, + "loss": 0.5998, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.0345754623413086, + "rewards/margins": 0.29350045323371887, + "rewards/rejected": -1.328075885772705, + "step": 9350 + }, + { + "epoch": 1.6126809097174362, + "grad_norm": 29.344892501831055, + "learning_rate": 5.217457107886159e-08, + "logits/chosen": -2.438901901245117, + "logits/rejected": -2.406013011932373, + "logps/chosen": -173.286376953125, + "logps/rejected": -199.4598846435547, + "loss": 0.6025, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.1483434438705444, + "rewards/margins": 0.32900819182395935, + "rewards/rejected": -1.4773516654968262, + "step": 9360 + }, + { + "epoch": 1.6144038594073054, + "grad_norm": 42.48180389404297, + "learning_rate": 5.207441935771104e-08, + "logits/chosen": -2.430676221847534, + "logits/rejected": -2.3925089836120605, + "logps/chosen": -171.08541870117188, + "logps/rejected": -199.71249389648438, + "loss": 0.5897, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.1605144739151, + "rewards/margins": 0.3269973695278168, + "rewards/rejected": -1.4875118732452393, + "step": 9370 + }, + { + "epoch": 1.6161268090971743, + "grad_norm": 34.442657470703125, + "learning_rate": 5.197425929863204e-08, + "logits/chosen": -2.4080941677093506, + "logits/rejected": -2.3933424949645996, + "logps/chosen": -172.6584014892578, + "logps/rejected": -193.78787231445312, + "loss": 0.6481, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.1749662160873413, + "rewards/margins": 0.22580060362815857, + "rewards/rejected": -1.4007668495178223, + "step": 9380 + }, + { + "epoch": 1.6178497587870435, + "grad_norm": 27.828460693359375, + "learning_rate": 5.1874091304208314e-08, + "logits/chosen": -2.3001291751861572, + "logits/rejected": -2.2786030769348145, + "logps/chosen": -168.0280303955078, + "logps/rejected": -202.01181030273438, + "loss": 0.5829, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.141455888748169, + "rewards/margins": 0.340496689081192, + "rewards/rejected": -1.481952428817749, + "step": 9390 + }, + { + "epoch": 1.6195727084769125, + "grad_norm": 23.844345092773438, + "learning_rate": 5.17739157770554e-08, + "logits/chosen": -2.3625731468200684, + "logits/rejected": -2.3439574241638184, + "logps/chosen": -166.90896606445312, + "logps/rejected": -188.41632080078125, + "loss": 0.6401, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.112298607826233, + "rewards/margins": 0.23041932284832, + "rewards/rejected": -1.3427180051803589, + "step": 9400 + }, + { + "epoch": 1.6212956581667815, + "grad_norm": 31.775917053222656, + "learning_rate": 5.167373311981922e-08, + "logits/chosen": -2.359849452972412, + "logits/rejected": -2.3399579524993896, + "logps/chosen": -170.50335693359375, + "logps/rejected": -194.62030029296875, + "loss": 0.6208, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.1581764221191406, + "rewards/margins": 0.27720901370048523, + "rewards/rejected": -1.4353853464126587, + "step": 9410 + }, + { + "epoch": 1.6230186078566504, + "grad_norm": 60.274497985839844, + "learning_rate": 5.157354373517425e-08, + "logits/chosen": -2.380483388900757, + "logits/rejected": -2.3661274909973145, + "logps/chosen": -184.9801483154297, + "logps/rejected": -195.0530242919922, + "loss": 0.6833, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.2929103374481201, + "rewards/margins": 0.14616604149341583, + "rewards/rejected": -1.4390761852264404, + "step": 9420 + }, + { + "epoch": 1.6247415575465196, + "grad_norm": 28.30097198486328, + "learning_rate": 5.147334802582208e-08, + "logits/chosen": -2.361516237258911, + "logits/rejected": -2.341750383377075, + "logps/chosen": -170.9657440185547, + "logps/rejected": -186.64126586914062, + "loss": 0.6554, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.1667261123657227, + "rewards/margins": 0.22141973674297333, + "rewards/rejected": -1.3881456851959229, + "step": 9430 + }, + { + "epoch": 1.6264645072363888, + "grad_norm": 39.51325225830078, + "learning_rate": 5.1373146394489706e-08, + "logits/chosen": -2.3611459732055664, + "logits/rejected": -2.3512301445007324, + "logps/chosen": -155.964599609375, + "logps/rejected": -182.50051879882812, + "loss": 0.6236, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.0606611967086792, + "rewards/margins": 0.2499229609966278, + "rewards/rejected": -1.310584306716919, + "step": 9440 + }, + { + "epoch": 1.6281874569262578, + "grad_norm": 49.02311706542969, + "learning_rate": 5.127293924392787e-08, + "logits/chosen": -2.474292039871216, + "logits/rejected": -2.460292100906372, + "logps/chosen": -170.9194793701172, + "logps/rejected": -184.9275360107422, + "loss": 0.6621, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.168145775794983, + "rewards/margins": 0.18423514068126678, + "rewards/rejected": -1.3523808717727661, + "step": 9450 + }, + { + "epoch": 1.6299104066161267, + "grad_norm": 28.96719741821289, + "learning_rate": 5.117272697690961e-08, + "logits/chosen": -2.386169672012329, + "logits/rejected": -2.3760600090026855, + "logps/chosen": -152.43272399902344, + "logps/rejected": -196.67221069335938, + "loss": 0.5304, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -0.9874264001846313, + "rewards/margins": 0.4499265253543854, + "rewards/rejected": -1.4373528957366943, + "step": 9460 + }, + { + "epoch": 1.6316333563059957, + "grad_norm": 31.411161422729492, + "learning_rate": 5.10725099962284e-08, + "logits/chosen": -2.2730417251586914, + "logits/rejected": -2.2473788261413574, + "logps/chosen": -161.56280517578125, + "logps/rejected": -181.9588623046875, + "loss": 0.6425, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.0859378576278687, + "rewards/margins": 0.2371862232685089, + "rewards/rejected": -1.3231239318847656, + "step": 9470 + }, + { + "epoch": 1.633356305995865, + "grad_norm": 30.01712417602539, + "learning_rate": 5.0972288704696764e-08, + "logits/chosen": -2.3781850337982178, + "logits/rejected": -2.3473598957061768, + "logps/chosen": -173.58242797851562, + "logps/rejected": -196.97030639648438, + "loss": 0.6137, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.188126802444458, + "rewards/margins": 0.27098771929740906, + "rewards/rejected": -1.4591143131256104, + "step": 9480 + }, + { + "epoch": 1.635079255685734, + "grad_norm": 31.82417869567871, + "learning_rate": 5.0872063505144494e-08, + "logits/chosen": -2.3453164100646973, + "logits/rejected": -2.3215858936309814, + "logps/chosen": -176.57174682617188, + "logps/rejected": -201.98179626464844, + "loss": 0.6095, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.2095147371292114, + "rewards/margins": 0.29533299803733826, + "rewards/rejected": -1.504847764968872, + "step": 9490 + }, + { + "epoch": 1.636802205375603, + "grad_norm": 29.027536392211914, + "learning_rate": 5.077183480041711e-08, + "logits/chosen": -2.3935375213623047, + "logits/rejected": -2.378740072250366, + "logps/chosen": -171.82821655273438, + "logps/rejected": -199.50689697265625, + "loss": 0.6035, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.169775366783142, + "rewards/margins": 0.29034778475761414, + "rewards/rejected": -1.4601233005523682, + "step": 9500 + }, + { + "epoch": 1.638525155065472, + "grad_norm": 36.75428009033203, + "learning_rate": 5.067160299337423e-08, + "logits/chosen": -2.3084945678710938, + "logits/rejected": -2.2897262573242188, + "logps/chosen": -177.8446502685547, + "logps/rejected": -214.07833862304688, + "loss": 0.5932, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.2567390203475952, + "rewards/margins": 0.35475316643714905, + "rewards/rejected": -1.6114921569824219, + "step": 9510 + }, + { + "epoch": 1.640248104755341, + "grad_norm": 28.859760284423828, + "learning_rate": 5.0571368486887913e-08, + "logits/chosen": -2.4560189247131348, + "logits/rejected": -2.4452311992645264, + "logps/chosen": -187.6722869873047, + "logps/rejected": -218.9191436767578, + "loss": 0.6331, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.3441131114959717, + "rewards/margins": 0.2731940746307373, + "rewards/rejected": -1.6173073053359985, + "step": 9520 + }, + { + "epoch": 1.6419710544452102, + "grad_norm": 30.882644653320312, + "learning_rate": 5.047113168384112e-08, + "logits/chosen": -2.3941006660461426, + "logits/rejected": -2.3643088340759277, + "logps/chosen": -185.5674591064453, + "logps/rejected": -214.42153930664062, + "loss": 0.6052, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.311025619506836, + "rewards/margins": 0.3395896553993225, + "rewards/rejected": -1.6506150960922241, + "step": 9530 + }, + { + "epoch": 1.6436940041350794, + "grad_norm": 26.167566299438477, + "learning_rate": 5.037089298712597e-08, + "logits/chosen": -2.3372859954833984, + "logits/rejected": -2.311713933944702, + "logps/chosen": -179.64578247070312, + "logps/rejected": -211.289794921875, + "loss": 0.6028, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.246484637260437, + "rewards/margins": 0.34223586320877075, + "rewards/rejected": -1.5887203216552734, + "step": 9540 + }, + { + "epoch": 1.6454169538249483, + "grad_norm": 30.182079315185547, + "learning_rate": 5.027065279964226e-08, + "logits/chosen": -2.3960721492767334, + "logits/rejected": -2.393296003341675, + "logps/chosen": -173.44830322265625, + "logps/rejected": -198.02462768554688, + "loss": 0.6382, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.1929553747177124, + "rewards/margins": 0.23101527988910675, + "rewards/rejected": -1.4239708185195923, + "step": 9550 + }, + { + "epoch": 1.6471399035148173, + "grad_norm": 45.042503356933594, + "learning_rate": 5.017041152429572e-08, + "logits/chosen": -2.4475998878479004, + "logits/rejected": -2.438934803009033, + "logps/chosen": -168.21823120117188, + "logps/rejected": -186.36083984375, + "loss": 0.6511, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1365259885787964, + "rewards/margins": 0.20526358485221863, + "rewards/rejected": -1.341789722442627, + "step": 9560 + }, + { + "epoch": 1.6488628532046863, + "grad_norm": 23.81475257873535, + "learning_rate": 5.00701695639965e-08, + "logits/chosen": -2.3587894439697266, + "logits/rejected": -2.3438403606414795, + "logps/chosen": -161.5234832763672, + "logps/rejected": -186.20321655273438, + "loss": 0.6293, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.0926761627197266, + "rewards/margins": 0.26515525579452515, + "rewards/rejected": -1.3578314781188965, + "step": 9570 + }, + { + "epoch": 1.6505858028945555, + "grad_norm": 33.94333267211914, + "learning_rate": 4.99699273216575e-08, + "logits/chosen": -2.4317760467529297, + "logits/rejected": -2.414454460144043, + "logps/chosen": -159.59573364257812, + "logps/rejected": -178.28103637695312, + "loss": 0.65, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.0440845489501953, + "rewards/margins": 0.17422762513160706, + "rewards/rejected": -1.2183122634887695, + "step": 9580 + }, + { + "epoch": 1.6523087525844247, + "grad_norm": 24.0312442779541, + "learning_rate": 4.986968520019272e-08, + "logits/chosen": -2.5148632526397705, + "logits/rejected": -2.495476245880127, + "logps/chosen": -157.68704223632812, + "logps/rejected": -177.65390014648438, + "loss": 0.6358, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.0029971599578857, + "rewards/margins": 0.21555516123771667, + "rewards/rejected": -1.2185523509979248, + "step": 9590 + }, + { + "epoch": 1.6540317022742936, + "grad_norm": 28.904621124267578, + "learning_rate": 4.9769443602515724e-08, + "logits/chosen": -2.390204668045044, + "logits/rejected": -2.3624749183654785, + "logps/chosen": -157.79043579101562, + "logps/rejected": -180.45880126953125, + "loss": 0.6211, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.0174524784088135, + "rewards/margins": 0.2713993489742279, + "rewards/rejected": -1.2888518571853638, + "step": 9600 + }, + { + "epoch": 1.6540317022742936, + "eval_logits/chosen": -2.480398416519165, + "eval_logits/rejected": -2.4724621772766113, + "eval_logps/chosen": -137.76979064941406, + "eval_logps/rejected": -155.23399353027344, + "eval_loss": 0.6525858640670776, + "eval_rewards/accuracies": 0.609433114528656, + "eval_rewards/chosen": -0.7875431180000305, + "eval_rewards/margins": 0.13730084896087646, + "eval_rewards/rejected": -0.9248440265655518, + "eval_runtime": 383.5461, + "eval_samples_per_second": 11.222, + "eval_steps_per_second": 1.403, + "step": 9600 + }, + { + "epoch": 1.6557546519641626, + "grad_norm": 27.6368408203125, + "learning_rate": 4.9669202931537895e-08, + "logits/chosen": -2.393742799758911, + "logits/rejected": -2.379835605621338, + "logps/chosen": -147.69479370117188, + "logps/rejected": -173.48165893554688, + "loss": 0.5951, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.8961979746818542, + "rewards/margins": 0.2986195981502533, + "rewards/rejected": -1.1948175430297852, + "step": 9610 + }, + { + "epoch": 1.6574776016540316, + "grad_norm": 27.73402214050293, + "learning_rate": 4.956896359016698e-08, + "logits/chosen": -2.472517251968384, + "logits/rejected": -2.4543066024780273, + "logps/chosen": -151.52615356445312, + "logps/rejected": -170.35833740234375, + "loss": 0.6451, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.9573991894721985, + "rewards/margins": 0.1973901093006134, + "rewards/rejected": -1.1547894477844238, + "step": 9620 + }, + { + "epoch": 1.6592005513439008, + "grad_norm": 34.30169677734375, + "learning_rate": 4.946872598130531e-08, + "logits/chosen": -2.41727352142334, + "logits/rejected": -2.3968358039855957, + "logps/chosen": -156.9556884765625, + "logps/rejected": -179.29806518554688, + "loss": 0.6254, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.012807846069336, + "rewards/margins": 0.2528161108493805, + "rewards/rejected": -1.265623927116394, + "step": 9630 + }, + { + "epoch": 1.66092350103377, + "grad_norm": 30.467981338500977, + "learning_rate": 4.9368490507848285e-08, + "logits/chosen": -2.4359209537506104, + "logits/rejected": -2.4101357460021973, + "logps/chosen": -153.869873046875, + "logps/rejected": -172.2772979736328, + "loss": 0.6071, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9647769927978516, + "rewards/margins": 0.26178082823753357, + "rewards/rejected": -1.2265578508377075, + "step": 9640 + }, + { + "epoch": 1.662646450723639, + "grad_norm": 35.35158920288086, + "learning_rate": 4.926825757268276e-08, + "logits/chosen": -2.3688926696777344, + "logits/rejected": -2.3474221229553223, + "logps/chosen": -154.3858184814453, + "logps/rejected": -171.7613067626953, + "loss": 0.6398, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.9814737439155579, + "rewards/margins": 0.1964212954044342, + "rewards/rejected": -1.1778948307037354, + "step": 9650 + }, + { + "epoch": 1.664369400413508, + "grad_norm": 28.72515296936035, + "learning_rate": 4.916802757868529e-08, + "logits/chosen": -2.364741802215576, + "logits/rejected": -2.3553318977355957, + "logps/chosen": -146.39309692382812, + "logps/rejected": -177.4051513671875, + "loss": 0.5995, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.9462183713912964, + "rewards/margins": 0.2969276010990143, + "rewards/rejected": -1.2431461811065674, + "step": 9660 + }, + { + "epoch": 1.6660923501033769, + "grad_norm": 26.201595306396484, + "learning_rate": 4.906780092872069e-08, + "logits/chosen": -2.45100474357605, + "logits/rejected": -2.4262044429779053, + "logps/chosen": -157.6940460205078, + "logps/rejected": -181.42202758789062, + "loss": 0.6079, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.9933892488479614, + "rewards/margins": 0.29405349493026733, + "rewards/rejected": -1.2874428033828735, + "step": 9670 + }, + { + "epoch": 1.667815299793246, + "grad_norm": 24.7778263092041, + "learning_rate": 4.89675780256403e-08, + "logits/chosen": -2.3970322608947754, + "logits/rejected": -2.3876750469207764, + "logps/chosen": -158.8399658203125, + "logps/rejected": -172.13357543945312, + "loss": 0.6546, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.0322105884552002, + "rewards/margins": 0.17332449555397034, + "rewards/rejected": -1.2055351734161377, + "step": 9680 + }, + { + "epoch": 1.6695382494831152, + "grad_norm": 38.30397033691406, + "learning_rate": 4.886735927228044e-08, + "logits/chosen": -2.33786940574646, + "logits/rejected": -2.3223893642425537, + "logps/chosen": -158.79647827148438, + "logps/rejected": -175.82296752929688, + "loss": 0.6322, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.023982286453247, + "rewards/margins": 0.2030734121799469, + "rewards/rejected": -1.227055549621582, + "step": 9690 + }, + { + "epoch": 1.6712611991729842, + "grad_norm": 63.075931549072266, + "learning_rate": 4.876714507146066e-08, + "logits/chosen": -2.3727738857269287, + "logits/rejected": -2.3485240936279297, + "logps/chosen": -151.89422607421875, + "logps/rejected": -170.76956176757812, + "loss": 0.6475, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.9858682751655579, + "rewards/margins": 0.21038445830345154, + "rewards/rejected": -1.1962525844573975, + "step": 9700 + }, + { + "epoch": 1.6729841488628532, + "grad_norm": 34.793338775634766, + "learning_rate": 4.86669358259823e-08, + "logits/chosen": -2.299872875213623, + "logits/rejected": -2.292018413543701, + "logps/chosen": -147.75247192382812, + "logps/rejected": -167.35336303710938, + "loss": 0.6316, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.9372814297676086, + "rewards/margins": 0.22653648257255554, + "rewards/rejected": -1.163818120956421, + "step": 9710 + }, + { + "epoch": 1.6747070985527222, + "grad_norm": 28.445850372314453, + "learning_rate": 4.856673193862677e-08, + "logits/chosen": -2.4229607582092285, + "logits/rejected": -2.409935235977173, + "logps/chosen": -147.39321899414062, + "logps/rejected": -161.4310302734375, + "loss": 0.6444, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.923819899559021, + "rewards/margins": 0.18175753951072693, + "rewards/rejected": -1.1055775880813599, + "step": 9720 + }, + { + "epoch": 1.6764300482425913, + "grad_norm": 25.407543182373047, + "learning_rate": 4.846653381215391e-08, + "logits/chosen": -2.4299368858337402, + "logits/rejected": -2.4185292720794678, + "logps/chosen": -141.1247100830078, + "logps/rejected": -164.10366821289062, + "loss": 0.6213, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.8609121441841125, + "rewards/margins": 0.2429952174425125, + "rewards/rejected": -1.1039073467254639, + "step": 9730 + }, + { + "epoch": 1.6781529979324605, + "grad_norm": 27.56557273864746, + "learning_rate": 4.836634184930043e-08, + "logits/chosen": -2.4252548217773438, + "logits/rejected": -2.410492420196533, + "logps/chosen": -141.70298767089844, + "logps/rejected": -158.42745971679688, + "loss": 0.636, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -0.861044704914093, + "rewards/margins": 0.1996491253376007, + "rewards/rejected": -1.0606937408447266, + "step": 9740 + }, + { + "epoch": 1.6798759476223295, + "grad_norm": 24.753618240356445, + "learning_rate": 4.826615645277823e-08, + "logits/chosen": -2.436265230178833, + "logits/rejected": -2.3932459354400635, + "logps/chosen": -149.7669219970703, + "logps/rejected": -166.84420776367188, + "loss": 0.6266, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.9381783604621887, + "rewards/margins": 0.24333791434764862, + "rewards/rejected": -1.181516170501709, + "step": 9750 + }, + { + "epoch": 1.6815988973121985, + "grad_norm": 27.361852645874023, + "learning_rate": 4.8165978025272865e-08, + "logits/chosen": -2.4168701171875, + "logits/rejected": -2.3895132541656494, + "logps/chosen": -144.3466796875, + "logps/rejected": -165.06533813476562, + "loss": 0.6197, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.9014531970024109, + "rewards/margins": 0.24613547325134277, + "rewards/rejected": -1.1475884914398193, + "step": 9760 + }, + { + "epoch": 1.6833218470020674, + "grad_norm": 27.26271629333496, + "learning_rate": 4.806580696944186e-08, + "logits/chosen": -2.3578128814697266, + "logits/rejected": -2.338855266571045, + "logps/chosen": -147.1383819580078, + "logps/rejected": -170.07638549804688, + "loss": 0.6291, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -0.9200402498245239, + "rewards/margins": 0.24137751758098602, + "rewards/rejected": -1.1614177227020264, + "step": 9770 + }, + { + "epoch": 1.6850447966919366, + "grad_norm": 30.34505271911621, + "learning_rate": 4.796564368791311e-08, + "logits/chosen": -2.393284559249878, + "logits/rejected": -2.350308895111084, + "logps/chosen": -159.84327697753906, + "logps/rejected": -179.5383758544922, + "loss": 0.6137, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.9826564788818359, + "rewards/margins": 0.3017580807209015, + "rewards/rejected": -1.284414529800415, + "step": 9780 + }, + { + "epoch": 1.6867677463818056, + "grad_norm": 26.540788650512695, + "learning_rate": 4.786548858328325e-08, + "logits/chosen": -2.403374195098877, + "logits/rejected": -2.395364999771118, + "logps/chosen": -151.01055908203125, + "logps/rejected": -186.31092834472656, + "loss": 0.5993, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.9662774801254272, + "rewards/margins": 0.33149346709251404, + "rewards/rejected": -1.2977708578109741, + "step": 9790 + }, + { + "epoch": 1.6884906960716748, + "grad_norm": 32.02655029296875, + "learning_rate": 4.7765342058116057e-08, + "logits/chosen": -2.432471513748169, + "logits/rejected": -2.403170347213745, + "logps/chosen": -157.26419067382812, + "logps/rejected": -178.321044921875, + "loss": 0.6271, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.986743152141571, + "rewards/margins": 0.26471954584121704, + "rewards/rejected": -1.251462697982788, + "step": 9800 + }, + { + "epoch": 1.6902136457615438, + "grad_norm": 26.325098037719727, + "learning_rate": 4.766520451494082e-08, + "logits/chosen": -2.39579176902771, + "logits/rejected": -2.3640379905700684, + "logps/chosen": -156.1612091064453, + "logps/rejected": -179.22048950195312, + "loss": 0.6284, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.9938696622848511, + "rewards/margins": 0.27399882674217224, + "rewards/rejected": -1.2678686380386353, + "step": 9810 + }, + { + "epoch": 1.6919365954514127, + "grad_norm": 28.532209396362305, + "learning_rate": 4.756507635625075e-08, + "logits/chosen": -2.3948326110839844, + "logits/rejected": -2.368168830871582, + "logps/chosen": -151.87503051757812, + "logps/rejected": -185.233154296875, + "loss": 0.5835, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.9678211212158203, + "rewards/margins": 0.35295677185058594, + "rewards/rejected": -1.3207777738571167, + "step": 9820 + }, + { + "epoch": 1.693659545141282, + "grad_norm": 25.736608505249023, + "learning_rate": 4.7464957984501324e-08, + "logits/chosen": -2.4210550785064697, + "logits/rejected": -2.412477970123291, + "logps/chosen": -163.955078125, + "logps/rejected": -183.9481964111328, + "loss": 0.6479, + "rewards/accuracies": 0.53125, + "rewards/chosen": -1.0461199283599854, + "rewards/margins": 0.23021705448627472, + "rewards/rejected": -1.2763371467590332, + "step": 9830 + }, + { + "epoch": 1.6953824948311509, + "grad_norm": 36.30265426635742, + "learning_rate": 4.736484980210865e-08, + "logits/chosen": -2.382336139678955, + "logits/rejected": -2.355818510055542, + "logps/chosen": -155.4736328125, + "logps/rejected": -182.4906463623047, + "loss": 0.6175, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.9920843839645386, + "rewards/margins": 0.30391108989715576, + "rewards/rejected": -1.2959954738616943, + "step": 9840 + }, + { + "epoch": 1.69710544452102, + "grad_norm": 26.407812118530273, + "learning_rate": 4.726475221144791e-08, + "logits/chosen": -2.3951416015625, + "logits/rejected": -2.3838553428649902, + "logps/chosen": -149.45062255859375, + "logps/rejected": -173.43222045898438, + "loss": 0.5941, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.930752158164978, + "rewards/margins": 0.292227566242218, + "rewards/rejected": -1.2229797840118408, + "step": 9850 + }, + { + "epoch": 1.698828394210889, + "grad_norm": 32.601341247558594, + "learning_rate": 4.7164665614851735e-08, + "logits/chosen": -2.4249911308288574, + "logits/rejected": -2.4174537658691406, + "logps/chosen": -166.8317413330078, + "logps/rejected": -176.670166015625, + "loss": 0.675, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.0959227085113525, + "rewards/margins": 0.1376759260892868, + "rewards/rejected": -1.2335984706878662, + "step": 9860 + }, + { + "epoch": 1.700551343900758, + "grad_norm": 25.09123992919922, + "learning_rate": 4.706459041460853e-08, + "logits/chosen": -2.398160934448242, + "logits/rejected": -2.3736088275909424, + "logps/chosen": -155.81951904296875, + "logps/rejected": -177.24488830566406, + "loss": 0.6168, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.021740198135376, + "rewards/margins": 0.25436100363731384, + "rewards/rejected": -1.2761012315750122, + "step": 9870 + }, + { + "epoch": 1.7022742935906272, + "grad_norm": 28.56049346923828, + "learning_rate": 4.69645270129609e-08, + "logits/chosen": -2.3422672748565674, + "logits/rejected": -2.3361213207244873, + "logps/chosen": -157.16632080078125, + "logps/rejected": -182.27499389648438, + "loss": 0.6296, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.0430997610092163, + "rewards/margins": 0.23366820812225342, + "rewards/rejected": -1.2767678499221802, + "step": 9880 + }, + { + "epoch": 1.7039972432804962, + "grad_norm": 27.852136611938477, + "learning_rate": 4.686447581210404e-08, + "logits/chosen": -2.3364086151123047, + "logits/rejected": -2.3273282051086426, + "logps/chosen": -157.48739624023438, + "logps/rejected": -184.2283477783203, + "loss": 0.592, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.9968830347061157, + "rewards/margins": 0.3168890178203583, + "rewards/rejected": -1.313772201538086, + "step": 9890 + }, + { + "epoch": 1.7057201929703654, + "grad_norm": 22.739540100097656, + "learning_rate": 4.676443721418408e-08, + "logits/chosen": -2.3864567279815674, + "logits/rejected": -2.3565258979797363, + "logps/chosen": -149.53790283203125, + "logps/rejected": -188.0727081298828, + "loss": 0.553, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.9652314186096191, + "rewards/margins": 0.42683133482933044, + "rewards/rejected": -1.3920629024505615, + "step": 9900 + }, + { + "epoch": 1.7074431426602343, + "grad_norm": 28.658220291137695, + "learning_rate": 4.666441162129653e-08, + "logits/chosen": -2.425554037094116, + "logits/rejected": -2.3846659660339355, + "logps/chosen": -166.473876953125, + "logps/rejected": -183.8915252685547, + "loss": 0.6209, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.0830899477005005, + "rewards/margins": 0.25999629497528076, + "rewards/rejected": -1.3430863618850708, + "step": 9910 + }, + { + "epoch": 1.7091660923501033, + "grad_norm": 37.005409240722656, + "learning_rate": 4.6564399435484616e-08, + "logits/chosen": -2.4445948600769043, + "logits/rejected": -2.421433448791504, + "logps/chosen": -164.7544403076172, + "logps/rejected": -185.24612426757812, + "loss": 0.6192, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1069726943969727, + "rewards/margins": 0.2556740641593933, + "rewards/rejected": -1.3626466989517212, + "step": 9920 + }, + { + "epoch": 1.7108890420399723, + "grad_norm": 29.145448684692383, + "learning_rate": 4.646440105873764e-08, + "logits/chosen": -2.3813118934631348, + "logits/rejected": -2.378369092941284, + "logps/chosen": -158.937744140625, + "logps/rejected": -188.14231872558594, + "loss": 0.6131, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.056016206741333, + "rewards/margins": 0.28049999475479126, + "rewards/rejected": -1.3365163803100586, + "step": 9930 + }, + { + "epoch": 1.7126119917298415, + "grad_norm": 24.187950134277344, + "learning_rate": 4.636441689298945e-08, + "logits/chosen": -2.4301557540893555, + "logits/rejected": -2.422311305999756, + "logps/chosen": -161.22079467773438, + "logps/rejected": -188.21884155273438, + "loss": 0.6328, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.0536234378814697, + "rewards/margins": 0.22920770943164825, + "rewards/rejected": -1.282831072807312, + "step": 9940 + }, + { + "epoch": 1.7143349414197107, + "grad_norm": 41.217735290527344, + "learning_rate": 4.626444734011674e-08, + "logits/chosen": -2.3997209072113037, + "logits/rejected": -2.367011308670044, + "logps/chosen": -166.6905059814453, + "logps/rejected": -196.6645965576172, + "loss": 0.5842, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.1190987825393677, + "rewards/margins": 0.3476200997829437, + "rewards/rejected": -1.4667189121246338, + "step": 9950 + }, + { + "epoch": 1.7160578911095796, + "grad_norm": 40.069129943847656, + "learning_rate": 4.6164492801937516e-08, + "logits/chosen": -2.428872585296631, + "logits/rejected": -2.413508892059326, + "logps/chosen": -171.06387329101562, + "logps/rejected": -193.9467010498047, + "loss": 0.6359, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.1686700582504272, + "rewards/margins": 0.2308274209499359, + "rewards/rejected": -1.399497628211975, + "step": 9960 + }, + { + "epoch": 1.7177808407994486, + "grad_norm": 53.755035400390625, + "learning_rate": 4.606455368020934e-08, + "logits/chosen": -2.419379711151123, + "logits/rejected": -2.4056499004364014, + "logps/chosen": -164.5887451171875, + "logps/rejected": -188.0942840576172, + "loss": 0.6267, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1215044260025024, + "rewards/margins": 0.2510332465171814, + "rewards/rejected": -1.3725377321243286, + "step": 9970 + }, + { + "epoch": 1.7195037904893176, + "grad_norm": 36.47673416137695, + "learning_rate": 4.59646303766279e-08, + "logits/chosen": -2.3453407287597656, + "logits/rejected": -2.328829050064087, + "logps/chosen": -163.23776245117188, + "logps/rejected": -195.60076904296875, + "loss": 0.5966, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.1016603708267212, + "rewards/margins": 0.3368353545665741, + "rewards/rejected": -1.4384956359863281, + "step": 9980 + }, + { + "epoch": 1.7212267401791868, + "grad_norm": 39.14576721191406, + "learning_rate": 4.586472329282529e-08, + "logits/chosen": -2.4195375442504883, + "logits/rejected": -2.3854496479034424, + "logps/chosen": -161.21798706054688, + "logps/rejected": -180.47915649414062, + "loss": 0.6193, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.0634045600891113, + "rewards/margins": 0.2559213936328888, + "rewards/rejected": -1.3193260431289673, + "step": 9990 + }, + { + "epoch": 1.722949689869056, + "grad_norm": 30.87076759338379, + "learning_rate": 4.576483283036835e-08, + "logits/chosen": -2.41550350189209, + "logits/rejected": -2.3969714641571045, + "logps/chosen": -161.6389923095703, + "logps/rejected": -189.10501098632812, + "loss": 0.6011, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.047298789024353, + "rewards/margins": 0.2876473367214203, + "rewards/rejected": -1.3349462747573853, + "step": 10000 + }, + { + "epoch": 1.722949689869056, + "eval_logits/chosen": -2.4489173889160156, + "eval_logits/rejected": -2.4395930767059326, + "eval_logps/chosen": -148.1359405517578, + "eval_logps/rejected": -166.54095458984375, + "eval_loss": 0.6517484784126282, + "eval_rewards/accuracies": 0.609897792339325, + "eval_rewards/chosen": -0.8912045955657959, + "eval_rewards/margins": 0.14670883119106293, + "eval_rewards/rejected": -1.03791344165802, + "eval_runtime": 383.0999, + "eval_samples_per_second": 11.235, + "eval_steps_per_second": 1.404, + "step": 10000 + }, + { + "epoch": 1.724672639558925, + "grad_norm": 26.357059478759766, + "learning_rate": 4.566495939075722e-08, + "logits/chosen": -2.4122986793518066, + "logits/rejected": -2.391216278076172, + "logps/chosen": -158.59512329101562, + "logps/rejected": -191.3281707763672, + "loss": 0.5861, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.0556198358535767, + "rewards/margins": 0.33377084136009216, + "rewards/rejected": -1.3893907070159912, + "step": 10010 + }, + { + "epoch": 1.7263955892487939, + "grad_norm": 40.59810256958008, + "learning_rate": 4.5565103375423466e-08, + "logits/chosen": -2.356652021408081, + "logits/rejected": -2.3264429569244385, + "logps/chosen": -166.05789184570312, + "logps/rejected": -190.78692626953125, + "loss": 0.6051, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.1111173629760742, + "rewards/margins": 0.2901898920536041, + "rewards/rejected": -1.4013073444366455, + "step": 10020 + }, + { + "epoch": 1.7281185389386629, + "grad_norm": 35.417301177978516, + "learning_rate": 4.546526518572878e-08, + "logits/chosen": -2.358099937438965, + "logits/rejected": -2.3320469856262207, + "logps/chosen": -171.77035522460938, + "logps/rejected": -183.01419067382812, + "loss": 0.655, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -1.1348702907562256, + "rewards/margins": 0.18133307993412018, + "rewards/rejected": -1.3162034749984741, + "step": 10030 + }, + { + "epoch": 1.729841488628532, + "grad_norm": 24.3604736328125, + "learning_rate": 4.5365445222963096e-08, + "logits/chosen": -2.4755070209503174, + "logits/rejected": -2.458996534347534, + "logps/chosen": -166.288330078125, + "logps/rejected": -194.246337890625, + "loss": 0.6074, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.1001951694488525, + "rewards/margins": 0.30437546968460083, + "rewards/rejected": -1.4045706987380981, + "step": 10040 + }, + { + "epoch": 1.7315644383184012, + "grad_norm": 30.295141220092773, + "learning_rate": 4.5265643888343146e-08, + "logits/chosen": -2.391026735305786, + "logits/rejected": -2.39094614982605, + "logps/chosen": -164.63150024414062, + "logps/rejected": -180.1056671142578, + "loss": 0.6687, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -1.0971840620040894, + "rewards/margins": 0.1522771418094635, + "rewards/rejected": -1.2494614124298096, + "step": 10050 + }, + { + "epoch": 1.7332873880082702, + "grad_norm": 40.7687873840332, + "learning_rate": 4.516586158301074e-08, + "logits/chosen": -2.3803515434265137, + "logits/rejected": -2.3743226528167725, + "logps/chosen": -152.97393798828125, + "logps/rejected": -176.10609436035156, + "loss": 0.6433, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.0329920053482056, + "rewards/margins": 0.20597794651985168, + "rewards/rejected": -1.2389700412750244, + "step": 10060 + }, + { + "epoch": 1.7350103376981392, + "grad_norm": 26.10736846923828, + "learning_rate": 4.506609870803122e-08, + "logits/chosen": -2.3440957069396973, + "logits/rejected": -2.3347837924957275, + "logps/chosen": -144.05136108398438, + "logps/rejected": -176.00039672851562, + "loss": 0.5923, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.9066449403762817, + "rewards/margins": 0.3289529085159302, + "rewards/rejected": -1.235597848892212, + "step": 10070 + }, + { + "epoch": 1.7367332873880081, + "grad_norm": 38.685455322265625, + "learning_rate": 4.4966355664391856e-08, + "logits/chosen": -2.416966676712036, + "logits/rejected": -2.3993144035339355, + "logps/chosen": -150.76454162597656, + "logps/rejected": -170.0929718017578, + "loss": 0.6309, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.9308739900588989, + "rewards/margins": 0.2240399420261383, + "rewards/rejected": -1.1549139022827148, + "step": 10080 + }, + { + "epoch": 1.7384562370778773, + "grad_norm": 29.607315063476562, + "learning_rate": 4.486663285300019e-08, + "logits/chosen": -2.469937324523926, + "logits/rejected": -2.447589635848999, + "logps/chosen": -142.93031311035156, + "logps/rejected": -175.0164031982422, + "loss": 0.5962, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.8724008798599243, + "rewards/margins": 0.3072863221168518, + "rewards/rejected": -1.179687261581421, + "step": 10090 + }, + { + "epoch": 1.7401791867677465, + "grad_norm": 34.174522399902344, + "learning_rate": 4.4766930674682446e-08, + "logits/chosen": -2.4112601280212402, + "logits/rejected": -2.3972017765045166, + "logps/chosen": -152.28091430664062, + "logps/rejected": -177.61593627929688, + "loss": 0.6097, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.9579811096191406, + "rewards/margins": 0.28297972679138184, + "rewards/rejected": -1.2409610748291016, + "step": 10100 + }, + { + "epoch": 1.7419021364576155, + "grad_norm": 30.270769119262695, + "learning_rate": 4.4667249530181866e-08, + "logits/chosen": -2.4306607246398926, + "logits/rejected": -2.427654981613159, + "logps/chosen": -157.984130859375, + "logps/rejected": -193.0686798095703, + "loss": 0.5951, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.0728533267974854, + "rewards/margins": 0.32097524404525757, + "rewards/rejected": -1.3938283920288086, + "step": 10110 + }, + { + "epoch": 1.7436250861474845, + "grad_norm": 34.1330451965332, + "learning_rate": 4.456758982015724e-08, + "logits/chosen": -2.393573045730591, + "logits/rejected": -2.3679986000061035, + "logps/chosen": -168.1916961669922, + "logps/rejected": -188.971923828125, + "loss": 0.6195, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.1371654272079468, + "rewards/margins": 0.2622528374195099, + "rewards/rejected": -1.3994182348251343, + "step": 10120 + }, + { + "epoch": 1.7453480358373534, + "grad_norm": 31.97394371032715, + "learning_rate": 4.446795194518113e-08, + "logits/chosen": -2.4118752479553223, + "logits/rejected": -2.3918135166168213, + "logps/chosen": -158.6249542236328, + "logps/rejected": -191.27536010742188, + "loss": 0.5906, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.039110541343689, + "rewards/margins": 0.34039172530174255, + "rewards/rejected": -1.379502296447754, + "step": 10130 + }, + { + "epoch": 1.7470709855272226, + "grad_norm": 31.597654342651367, + "learning_rate": 4.436833630573837e-08, + "logits/chosen": -2.389570951461792, + "logits/rejected": -2.3521625995635986, + "logps/chosen": -176.93966674804688, + "logps/rejected": -198.84036254882812, + "loss": 0.6193, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.1528842449188232, + "rewards/margins": 0.29966822266578674, + "rewards/rejected": -1.4525524377822876, + "step": 10140 + }, + { + "epoch": 1.7487939352170918, + "grad_norm": 29.19540786743164, + "learning_rate": 4.4268743302224405e-08, + "logits/chosen": -2.3549156188964844, + "logits/rejected": -2.3334720134735107, + "logps/chosen": -168.19863891601562, + "logps/rejected": -202.30337524414062, + "loss": 0.6053, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1256818771362305, + "rewards/margins": 0.34564125537872314, + "rewards/rejected": -1.471323013305664, + "step": 10150 + }, + { + "epoch": 1.7505168849069608, + "grad_norm": 29.48337173461914, + "learning_rate": 4.416917333494369e-08, + "logits/chosen": -2.362104892730713, + "logits/rejected": -2.3388655185699463, + "logps/chosen": -164.33436584472656, + "logps/rejected": -200.4606475830078, + "loss": 0.5722, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.1089446544647217, + "rewards/margins": 0.3804648816585541, + "rewards/rejected": -1.4894095659255981, + "step": 10160 + }, + { + "epoch": 1.7522398345968297, + "grad_norm": 29.898658752441406, + "learning_rate": 4.406962680410812e-08, + "logits/chosen": -2.338603973388672, + "logits/rejected": -2.3241982460021973, + "logps/chosen": -176.6895294189453, + "logps/rejected": -209.5714569091797, + "loss": 0.5924, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.2172462940216064, + "rewards/margins": 0.34438905119895935, + "rewards/rejected": -1.5616354942321777, + "step": 10170 + }, + { + "epoch": 1.7539627842866987, + "grad_norm": 25.77763557434082, + "learning_rate": 4.3970104109835374e-08, + "logits/chosen": -2.3041019439697266, + "logits/rejected": -2.2791824340820312, + "logps/chosen": -179.14761352539062, + "logps/rejected": -217.32827758789062, + "loss": 0.5787, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.2611709833145142, + "rewards/margins": 0.3846471607685089, + "rewards/rejected": -1.6458181142807007, + "step": 10180 + }, + { + "epoch": 1.755685733976568, + "grad_norm": 31.118616104125977, + "learning_rate": 4.387060565214732e-08, + "logits/chosen": -2.3026082515716553, + "logits/rejected": -2.278801202774048, + "logps/chosen": -174.61709594726562, + "logps/rejected": -214.9127960205078, + "loss": 0.5687, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.2221773862838745, + "rewards/margins": 0.4176904261112213, + "rewards/rejected": -1.6398680210113525, + "step": 10190 + }, + { + "epoch": 1.757408683666437, + "grad_norm": 30.557296752929688, + "learning_rate": 4.3771131830968386e-08, + "logits/chosen": -2.3609557151794434, + "logits/rejected": -2.3385212421417236, + "logps/chosen": -182.12051391601562, + "logps/rejected": -215.4132537841797, + "loss": 0.5951, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.2877042293548584, + "rewards/margins": 0.3666505813598633, + "rewards/rejected": -1.6543548107147217, + "step": 10200 + }, + { + "epoch": 1.759131633356306, + "grad_norm": 31.58578109741211, + "learning_rate": 4.367168304612399e-08, + "logits/chosen": -2.343324899673462, + "logits/rejected": -2.330151081085205, + "logps/chosen": -192.5848846435547, + "logps/rejected": -229.7525177001953, + "loss": 0.5968, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.3690006732940674, + "rewards/margins": 0.37162265181541443, + "rewards/rejected": -1.7406232357025146, + "step": 10210 + }, + { + "epoch": 1.760854583046175, + "grad_norm": 30.243844985961914, + "learning_rate": 4.3572259697338966e-08, + "logits/chosen": -2.3129944801330566, + "logits/rejected": -2.296821117401123, + "logps/chosen": -175.6407928466797, + "logps/rejected": -207.247314453125, + "loss": 0.6053, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.261752724647522, + "rewards/margins": 0.308516263961792, + "rewards/rejected": -1.5702688694000244, + "step": 10220 + }, + { + "epoch": 1.762577532736044, + "grad_norm": 33.17268753051758, + "learning_rate": 4.347286218423585e-08, + "logits/chosen": -2.2974283695220947, + "logits/rejected": -2.2750775814056396, + "logps/chosen": -179.20205688476562, + "logps/rejected": -197.1276092529297, + "loss": 0.6486, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.247676134109497, + "rewards/margins": 0.21209442615509033, + "rewards/rejected": -1.4597707986831665, + "step": 10230 + }, + { + "epoch": 1.7643004824259132, + "grad_norm": 31.049867630004883, + "learning_rate": 4.337349090633335e-08, + "logits/chosen": -2.3183791637420654, + "logits/rejected": -2.2915894985198975, + "logps/chosen": -174.53001403808594, + "logps/rejected": -216.4149627685547, + "loss": 0.571, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.2244672775268555, + "rewards/margins": 0.42240673303604126, + "rewards/rejected": -1.6468738317489624, + "step": 10240 + }, + { + "epoch": 1.7660234321157822, + "grad_norm": 33.40523910522461, + "learning_rate": 4.327414626304473e-08, + "logits/chosen": -2.3828606605529785, + "logits/rejected": -2.345085859298706, + "logps/chosen": -175.66213989257812, + "logps/rejected": -203.55291748046875, + "loss": 0.5824, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1799122095108032, + "rewards/margins": 0.3656123876571655, + "rewards/rejected": -1.5455245971679688, + "step": 10250 + }, + { + "epoch": 1.7677463818056514, + "grad_norm": 28.497297286987305, + "learning_rate": 4.317482865367619e-08, + "logits/chosen": -2.3601796627044678, + "logits/rejected": -2.3613171577453613, + "logps/chosen": -165.05870056152344, + "logps/rejected": -197.8215789794922, + "loss": 0.602, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.109321117401123, + "rewards/margins": 0.30615848302841187, + "rewards/rejected": -1.4154794216156006, + "step": 10260 + }, + { + "epoch": 1.7694693314955203, + "grad_norm": 37.638763427734375, + "learning_rate": 4.3075538477425296e-08, + "logits/chosen": -2.334505796432495, + "logits/rejected": -2.318866491317749, + "logps/chosen": -172.26190185546875, + "logps/rejected": -201.54942321777344, + "loss": 0.6051, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1580668687820435, + "rewards/margins": 0.30520960688591003, + "rewards/rejected": -1.4632765054702759, + "step": 10270 + }, + { + "epoch": 1.7711922811853893, + "grad_norm": 31.05406379699707, + "learning_rate": 4.2976276133379336e-08, + "logits/chosen": -2.3457462787628174, + "logits/rejected": -2.3291707038879395, + "logps/chosen": -174.33682250976562, + "logps/rejected": -188.51695251464844, + "loss": 0.664, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.192948579788208, + "rewards/margins": 0.16929271817207336, + "rewards/rejected": -1.362241506576538, + "step": 10280 + }, + { + "epoch": 1.7729152308752585, + "grad_norm": 32.82965850830078, + "learning_rate": 4.2877042020513696e-08, + "logits/chosen": -2.317387580871582, + "logits/rejected": -2.296901226043701, + "logps/chosen": -158.64199829101562, + "logps/rejected": -196.9527130126953, + "loss": 0.6009, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.0650023221969604, + "rewards/margins": 0.3558632731437683, + "rewards/rejected": -1.420865774154663, + "step": 10290 + }, + { + "epoch": 1.7746381805651275, + "grad_norm": 44.43830871582031, + "learning_rate": 4.2777836537690336e-08, + "logits/chosen": -2.372666358947754, + "logits/rejected": -2.3581936359405518, + "logps/chosen": -182.65878295898438, + "logps/rejected": -198.89340209960938, + "loss": 0.6712, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.2641236782073975, + "rewards/margins": 0.18516698479652405, + "rewards/rejected": -1.4492907524108887, + "step": 10300 + }, + { + "epoch": 1.7763611302549966, + "grad_norm": 27.348831176757812, + "learning_rate": 4.26786600836561e-08, + "logits/chosen": -2.3039889335632324, + "logits/rejected": -2.282872200012207, + "logps/chosen": -179.7623291015625, + "logps/rejected": -204.12564086914062, + "loss": 0.5981, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.22052001953125, + "rewards/margins": 0.3195902407169342, + "rewards/rejected": -1.5401101112365723, + "step": 10310 + }, + { + "epoch": 1.7780840799448656, + "grad_norm": 29.10089683532715, + "learning_rate": 4.2579513057041225e-08, + "logits/chosen": -2.3530561923980713, + "logits/rejected": -2.3259224891662598, + "logps/chosen": -178.1007843017578, + "logps/rejected": -199.89125061035156, + "loss": 0.6372, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.1998882293701172, + "rewards/margins": 0.2680136561393738, + "rewards/rejected": -1.4679019451141357, + "step": 10320 + }, + { + "epoch": 1.7798070296347346, + "grad_norm": 39.05019760131836, + "learning_rate": 4.248039585635756e-08, + "logits/chosen": -2.3624677658081055, + "logits/rejected": -2.341622829437256, + "logps/chosen": -167.41891479492188, + "logps/rejected": -196.73715209960938, + "loss": 0.604, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.1432722806930542, + "rewards/margins": 0.2914575934410095, + "rewards/rejected": -1.434729814529419, + "step": 10330 + }, + { + "epoch": 1.7815299793246038, + "grad_norm": 38.29509353637695, + "learning_rate": 4.238130887999716e-08, + "logits/chosen": -2.4009697437286377, + "logits/rejected": -2.379375457763672, + "logps/chosen": -160.23373413085938, + "logps/rejected": -186.0135955810547, + "loss": 0.6137, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.0323786735534668, + "rewards/margins": 0.2908129394054413, + "rewards/rejected": -1.3231916427612305, + "step": 10340 + }, + { + "epoch": 1.7832529290144727, + "grad_norm": 38.13880157470703, + "learning_rate": 4.228225252623055e-08, + "logits/chosen": -2.432330846786499, + "logits/rejected": -2.408937692642212, + "logps/chosen": -158.62673950195312, + "logps/rejected": -179.88282775878906, + "loss": 0.621, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.0324501991271973, + "rewards/margins": 0.2672559320926666, + "rewards/rejected": -1.299706220626831, + "step": 10350 + }, + { + "epoch": 1.784975878704342, + "grad_norm": 23.965024948120117, + "learning_rate": 4.218322719320519e-08, + "logits/chosen": -2.365845203399658, + "logits/rejected": -2.339709758758545, + "logps/chosen": -159.224853515625, + "logps/rejected": -174.7471466064453, + "loss": 0.6421, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.0221807956695557, + "rewards/margins": 0.20925001800060272, + "rewards/rejected": -1.2314307689666748, + "step": 10360 + }, + { + "epoch": 1.786698828394211, + "grad_norm": 34.28385543823242, + "learning_rate": 4.208423327894387e-08, + "logits/chosen": -2.2426235675811768, + "logits/rejected": -2.216484546661377, + "logps/chosen": -153.36285400390625, + "logps/rejected": -181.20248413085938, + "loss": 0.5993, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.9984307289123535, + "rewards/margins": 0.29507023096084595, + "rewards/rejected": -1.2935011386871338, + "step": 10370 + }, + { + "epoch": 1.7884217780840799, + "grad_norm": 33.906288146972656, + "learning_rate": 4.1985271181343056e-08, + "logits/chosen": -2.350764751434326, + "logits/rejected": -2.337337017059326, + "logps/chosen": -163.58370971679688, + "logps/rejected": -175.857421875, + "loss": 0.689, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.1080776453018188, + "rewards/margins": 0.13066032528877258, + "rewards/rejected": -1.2387378215789795, + "step": 10380 + }, + { + "epoch": 1.7901447277739488, + "grad_norm": 27.24697494506836, + "learning_rate": 4.188634129817135e-08, + "logits/chosen": -2.3855433464050293, + "logits/rejected": -2.3613743782043457, + "logps/chosen": -153.98219299316406, + "logps/rejected": -177.3594512939453, + "loss": 0.6202, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.9690343141555786, + "rewards/margins": 0.27303311228752136, + "rewards/rejected": -1.2420674562454224, + "step": 10390 + }, + { + "epoch": 1.791867677463818, + "grad_norm": 27.075759887695312, + "learning_rate": 4.178744402706788e-08, + "logits/chosen": -2.415712833404541, + "logits/rejected": -2.400829792022705, + "logps/chosen": -151.54632568359375, + "logps/rejected": -190.5732879638672, + "loss": 0.571, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.9909058809280396, + "rewards/margins": 0.3732060194015503, + "rewards/rejected": -1.3641119003295898, + "step": 10400 + }, + { + "epoch": 1.791867677463818, + "eval_logits/chosen": -2.4488589763641357, + "eval_logits/rejected": -2.4401326179504395, + "eval_logps/chosen": -141.355712890625, + "eval_logps/rejected": -159.2781982421875, + "eval_loss": 0.651395857334137, + "eval_rewards/accuracies": 0.6122211813926697, + "eval_rewards/chosen": -0.8234025239944458, + "eval_rewards/margins": 0.14188359677791595, + "eval_rewards/rejected": -0.9652861952781677, + "eval_runtime": 383.1182, + "eval_samples_per_second": 11.234, + "eval_steps_per_second": 1.404, + "step": 10400 + }, + { + "epoch": 1.7935906271536872, + "grad_norm": 30.668682098388672, + "learning_rate": 4.168857976554067e-08, + "logits/chosen": -2.3561768531799316, + "logits/rejected": -2.3268985748291016, + "logps/chosen": -160.31906127929688, + "logps/rejected": -179.61732482910156, + "loss": 0.623, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.0286571979522705, + "rewards/margins": 0.2432531863451004, + "rewards/rejected": -1.2719104290008545, + "step": 10410 + }, + { + "epoch": 1.7953135768435562, + "grad_norm": 26.499998092651367, + "learning_rate": 4.1589748910965104e-08, + "logits/chosen": -2.3826560974121094, + "logits/rejected": -2.3583431243896484, + "logps/chosen": -157.2075958251953, + "logps/rejected": -185.43310546875, + "loss": 0.6181, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.0241360664367676, + "rewards/margins": 0.2945292592048645, + "rewards/rejected": -1.3186652660369873, + "step": 10420 + }, + { + "epoch": 1.7970365265334252, + "grad_norm": 24.107423782348633, + "learning_rate": 4.1490951860582243e-08, + "logits/chosen": -2.4138405323028564, + "logits/rejected": -2.393023729324341, + "logps/chosen": -157.1988525390625, + "logps/rejected": -177.00567626953125, + "loss": 0.6355, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.0148719549179077, + "rewards/margins": 0.23055486381053925, + "rewards/rejected": -1.2454270124435425, + "step": 10430 + }, + { + "epoch": 1.7987594762232941, + "grad_norm": 29.232955932617188, + "learning_rate": 4.139218901149731e-08, + "logits/chosen": -2.4301648139953613, + "logits/rejected": -2.4212965965270996, + "logps/chosen": -172.70167541503906, + "logps/rejected": -184.5668487548828, + "loss": 0.6641, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.1334761381149292, + "rewards/margins": 0.1488693505525589, + "rewards/rejected": -1.2823455333709717, + "step": 10440 + }, + { + "epoch": 1.8004824259131633, + "grad_norm": 31.3802490234375, + "learning_rate": 4.129346076067802e-08, + "logits/chosen": -2.4010443687438965, + "logits/rejected": -2.389110565185547, + "logps/chosen": -158.57315063476562, + "logps/rejected": -193.9378662109375, + "loss": 0.579, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.03981614112854, + "rewards/margins": 0.3365381360054016, + "rewards/rejected": -1.3763542175292969, + "step": 10450 + }, + { + "epoch": 1.8022053756030325, + "grad_norm": 34.95161819458008, + "learning_rate": 4.119476750495312e-08, + "logits/chosen": -2.385775327682495, + "logits/rejected": -2.356813907623291, + "logps/chosen": -162.02548217773438, + "logps/rejected": -187.3418426513672, + "loss": 0.6009, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.0644725561141968, + "rewards/margins": 0.28172314167022705, + "rewards/rejected": -1.3461956977844238, + "step": 10460 + }, + { + "epoch": 1.8039283252929015, + "grad_norm": 29.766733169555664, + "learning_rate": 4.109610964101054e-08, + "logits/chosen": -2.2834041118621826, + "logits/rejected": -2.2619574069976807, + "logps/chosen": -164.19271850585938, + "logps/rejected": -191.8291015625, + "loss": 0.6049, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.1161973476409912, + "rewards/margins": 0.304057240486145, + "rewards/rejected": -1.4202544689178467, + "step": 10470 + }, + { + "epoch": 1.8056512749827704, + "grad_norm": 35.490474700927734, + "learning_rate": 4.099748756539609e-08, + "logits/chosen": -2.372213840484619, + "logits/rejected": -2.3380684852600098, + "logps/chosen": -171.201904296875, + "logps/rejected": -206.79092407226562, + "loss": 0.5709, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.1381561756134033, + "rewards/margins": 0.41463032364845276, + "rewards/rejected": -1.5527865886688232, + "step": 10480 + }, + { + "epoch": 1.8073742246726394, + "grad_norm": 26.32746124267578, + "learning_rate": 4.089890167451169e-08, + "logits/chosen": -2.3567185401916504, + "logits/rejected": -2.3334903717041016, + "logps/chosen": -167.50057983398438, + "logps/rejected": -192.45741271972656, + "loss": 0.6156, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.1219137907028198, + "rewards/margins": 0.2818216383457184, + "rewards/rejected": -1.4037355184555054, + "step": 10490 + }, + { + "epoch": 1.8090971743625086, + "grad_norm": 35.64101791381836, + "learning_rate": 4.08003523646138e-08, + "logits/chosen": -2.3531155586242676, + "logits/rejected": -2.327070713043213, + "logps/chosen": -172.28802490234375, + "logps/rejected": -210.2729949951172, + "loss": 0.5829, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.1850135326385498, + "rewards/margins": 0.37663546204566956, + "rewards/rejected": -1.5616488456726074, + "step": 10500 + }, + { + "epoch": 1.8108201240523778, + "grad_norm": 39.186458587646484, + "learning_rate": 4.070184003181189e-08, + "logits/chosen": -2.3595175743103027, + "logits/rejected": -2.326897144317627, + "logps/chosen": -179.2899627685547, + "logps/rejected": -208.84072875976562, + "loss": 0.5982, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.231400728225708, + "rewards/margins": 0.3356294631958008, + "rewards/rejected": -1.5670301914215088, + "step": 10510 + }, + { + "epoch": 1.8125430737422468, + "grad_norm": 42.04875183105469, + "learning_rate": 4.060336507206673e-08, + "logits/chosen": -2.378934383392334, + "logits/rejected": -2.367832899093628, + "logps/chosen": -178.14813232421875, + "logps/rejected": -213.73025512695312, + "loss": 0.6138, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.2611725330352783, + "rewards/margins": 0.3409961760044098, + "rewards/rejected": -1.6021686792373657, + "step": 10520 + }, + { + "epoch": 1.8142660234321157, + "grad_norm": 38.262855529785156, + "learning_rate": 4.0504927881188946e-08, + "logits/chosen": -2.319676399230957, + "logits/rejected": -2.29738187789917, + "logps/chosen": -180.23928833007812, + "logps/rejected": -201.06912231445312, + "loss": 0.6395, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2533966302871704, + "rewards/margins": 0.24199731647968292, + "rewards/rejected": -1.4953938722610474, + "step": 10530 + }, + { + "epoch": 1.8159889731219847, + "grad_norm": 41.87706756591797, + "learning_rate": 4.040652885483733e-08, + "logits/chosen": -2.2700414657592773, + "logits/rejected": -2.2463417053222656, + "logps/chosen": -171.6498565673828, + "logps/rejected": -195.3922882080078, + "loss": 0.6154, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.1847819089889526, + "rewards/margins": 0.2865777015686035, + "rewards/rejected": -1.4713597297668457, + "step": 10540 + }, + { + "epoch": 1.817711922811854, + "grad_norm": 46.120147705078125, + "learning_rate": 4.0308168388517284e-08, + "logits/chosen": -2.4297916889190674, + "logits/rejected": -2.4190146923065186, + "logps/chosen": -179.94268798828125, + "logps/rejected": -208.4828338623047, + "loss": 0.6259, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.2590245008468628, + "rewards/margins": 0.27799472212791443, + "rewards/rejected": -1.5370192527770996, + "step": 10550 + }, + { + "epoch": 1.819434872501723, + "grad_norm": 36.81279754638672, + "learning_rate": 4.020984687757918e-08, + "logits/chosen": -2.3187246322631836, + "logits/rejected": -2.29058837890625, + "logps/chosen": -177.59408569335938, + "logps/rejected": -212.2113800048828, + "loss": 0.5926, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.208798885345459, + "rewards/margins": 0.3812781274318695, + "rewards/rejected": -1.5900771617889404, + "step": 10560 + }, + { + "epoch": 1.821157822191592, + "grad_norm": 37.20123291015625, + "learning_rate": 4.0111564717216845e-08, + "logits/chosen": -2.3557188510894775, + "logits/rejected": -2.3360562324523926, + "logps/chosen": -179.9247589111328, + "logps/rejected": -215.9092254638672, + "loss": 0.5865, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2344462871551514, + "rewards/margins": 0.3715837001800537, + "rewards/rejected": -1.6060298681259155, + "step": 10570 + }, + { + "epoch": 1.822880771881461, + "grad_norm": 31.85076904296875, + "learning_rate": 4.001332230246597e-08, + "logits/chosen": -2.3521053791046143, + "logits/rejected": -2.327073574066162, + "logps/chosen": -173.4813995361328, + "logps/rejected": -206.79641723632812, + "loss": 0.586, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.1977475881576538, + "rewards/margins": 0.34784871339797974, + "rewards/rejected": -1.5455963611602783, + "step": 10580 + }, + { + "epoch": 1.82460372157133, + "grad_norm": 30.5350284576416, + "learning_rate": 3.9915120028202434e-08, + "logits/chosen": -2.3166115283966064, + "logits/rejected": -2.2853188514709473, + "logps/chosen": -181.58470153808594, + "logps/rejected": -203.5180206298828, + "loss": 0.6113, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.2494252920150757, + "rewards/margins": 0.2973533272743225, + "rewards/rejected": -1.546778678894043, + "step": 10590 + }, + { + "epoch": 1.8263266712611992, + "grad_norm": 26.205764770507812, + "learning_rate": 3.9816958289140836e-08, + "logits/chosen": -2.386885404586792, + "logits/rejected": -2.3785994052886963, + "logps/chosen": -169.0087890625, + "logps/rejected": -195.49234008789062, + "loss": 0.6327, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.156529188156128, + "rewards/margins": 0.2596370577812195, + "rewards/rejected": -1.4161661863327026, + "step": 10600 + }, + { + "epoch": 1.8280496209510684, + "grad_norm": 40.030616760253906, + "learning_rate": 3.971883747983278e-08, + "logits/chosen": -2.3261427879333496, + "logits/rejected": -2.316303014755249, + "logps/chosen": -176.147705078125, + "logps/rejected": -204.08290100097656, + "loss": 0.6211, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2405706644058228, + "rewards/margins": 0.28227168321609497, + "rewards/rejected": -1.522842288017273, + "step": 10610 + }, + { + "epoch": 1.8297725706409373, + "grad_norm": 32.2052116394043, + "learning_rate": 3.9620757994665383e-08, + "logits/chosen": -2.2571442127227783, + "logits/rejected": -2.2298996448516846, + "logps/chosen": -172.920654296875, + "logps/rejected": -204.323974609375, + "loss": 0.5915, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.1804695129394531, + "rewards/margins": 0.3475439250469208, + "rewards/rejected": -1.5280134677886963, + "step": 10620 + }, + { + "epoch": 1.8314955203308063, + "grad_norm": 32.063472747802734, + "learning_rate": 3.952272022785971e-08, + "logits/chosen": -2.347079038619995, + "logits/rejected": -2.3194215297698975, + "logps/chosen": -169.39932250976562, + "logps/rejected": -202.79177856445312, + "loss": 0.5969, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.138787031173706, + "rewards/margins": 0.3578266203403473, + "rewards/rejected": -1.4966135025024414, + "step": 10630 + }, + { + "epoch": 1.8332184700206753, + "grad_norm": 61.603126525878906, + "learning_rate": 3.9424724573469094e-08, + "logits/chosen": -2.374408483505249, + "logits/rejected": -2.3449018001556396, + "logps/chosen": -172.2761993408203, + "logps/rejected": -200.76254272460938, + "loss": 0.6058, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.1696155071258545, + "rewards/margins": 0.32298147678375244, + "rewards/rejected": -1.4925968647003174, + "step": 10640 + }, + { + "epoch": 1.8349414197105445, + "grad_norm": 31.435705184936523, + "learning_rate": 3.9326771425377586e-08, + "logits/chosen": -2.364908218383789, + "logits/rejected": -2.3407516479492188, + "logps/chosen": -182.14035034179688, + "logps/rejected": -222.3705291748047, + "loss": 0.5796, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.2733443975448608, + "rewards/margins": 0.422320693731308, + "rewards/rejected": -1.6956650018692017, + "step": 10650 + }, + { + "epoch": 1.8366643694004137, + "grad_norm": 35.018035888671875, + "learning_rate": 3.9228861177298434e-08, + "logits/chosen": -2.2953758239746094, + "logits/rejected": -2.2792255878448486, + "logps/chosen": -186.21810913085938, + "logps/rejected": -211.9817657470703, + "loss": 0.6269, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.3122917413711548, + "rewards/margins": 0.26357850432395935, + "rewards/rejected": -1.5758702754974365, + "step": 10660 + }, + { + "epoch": 1.8383873190902826, + "grad_norm": 26.195331573486328, + "learning_rate": 3.913099422277242e-08, + "logits/chosen": -2.326352596282959, + "logits/rejected": -2.3013761043548584, + "logps/chosen": -184.91539001464844, + "logps/rejected": -218.487060546875, + "loss": 0.5974, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.3113325834274292, + "rewards/margins": 0.3626203238964081, + "rewards/rejected": -1.6739528179168701, + "step": 10670 + }, + { + "epoch": 1.8401102687801516, + "grad_norm": 31.851932525634766, + "learning_rate": 3.903317095516634e-08, + "logits/chosen": -2.346625804901123, + "logits/rejected": -2.3183517456054688, + "logps/chosen": -183.88546752929688, + "logps/rejected": -205.15170288085938, + "loss": 0.6126, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.263718843460083, + "rewards/margins": 0.29013028740882874, + "rewards/rejected": -1.5538489818572998, + "step": 10680 + }, + { + "epoch": 1.8418332184700206, + "grad_norm": 40.034358978271484, + "learning_rate": 3.893539176767138e-08, + "logits/chosen": -2.309453010559082, + "logits/rejected": -2.2947006225585938, + "logps/chosen": -185.39358520507812, + "logps/rejected": -224.47067260742188, + "loss": 0.5827, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.3045397996902466, + "rewards/margins": 0.3803395628929138, + "rewards/rejected": -1.6848793029785156, + "step": 10690 + }, + { + "epoch": 1.8435561681598898, + "grad_norm": 50.86007308959961, + "learning_rate": 3.8837657053301533e-08, + "logits/chosen": -2.3584494590759277, + "logits/rejected": -2.3197760581970215, + "logps/chosen": -180.21823120117188, + "logps/rejected": -205.2100830078125, + "loss": 0.5955, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.23667311668396, + "rewards/margins": 0.3189604878425598, + "rewards/rejected": -1.555633783340454, + "step": 10700 + }, + { + "epoch": 1.8452791178497587, + "grad_norm": 30.94768524169922, + "learning_rate": 3.873996720489205e-08, + "logits/chosen": -2.308336019515991, + "logits/rejected": -2.280362606048584, + "logps/chosen": -176.10208129882812, + "logps/rejected": -202.3173065185547, + "loss": 0.5951, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2233304977416992, + "rewards/margins": 0.31216123700141907, + "rewards/rejected": -1.535491704940796, + "step": 10710 + }, + { + "epoch": 1.847002067539628, + "grad_norm": 35.156558990478516, + "learning_rate": 3.864232261509787e-08, + "logits/chosen": -2.302485942840576, + "logits/rejected": -2.2754483222961426, + "logps/chosen": -184.95298767089844, + "logps/rejected": -211.0207061767578, + "loss": 0.6317, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.316679835319519, + "rewards/margins": 0.2793746590614319, + "rewards/rejected": -1.5960544347763062, + "step": 10720 + }, + { + "epoch": 1.848725017229497, + "grad_norm": 29.506084442138672, + "learning_rate": 3.8544723676392e-08, + "logits/chosen": -2.4065823554992676, + "logits/rejected": -2.3675832748413086, + "logps/chosen": -179.6064453125, + "logps/rejected": -211.91336059570312, + "loss": 0.5788, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.2562201023101807, + "rewards/margins": 0.36423414945602417, + "rewards/rejected": -1.62045419216156, + "step": 10730 + }, + { + "epoch": 1.8504479669193659, + "grad_norm": 42.23896789550781, + "learning_rate": 3.844717078106394e-08, + "logits/chosen": -2.29411244392395, + "logits/rejected": -2.273477077484131, + "logps/chosen": -187.98069763183594, + "logps/rejected": -210.1696319580078, + "loss": 0.6447, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.3325438499450684, + "rewards/margins": 0.22396263480186462, + "rewards/rejected": -1.5565065145492554, + "step": 10740 + }, + { + "epoch": 1.852170916609235, + "grad_norm": 33.77505874633789, + "learning_rate": 3.8349664321218135e-08, + "logits/chosen": -2.294753313064575, + "logits/rejected": -2.2612767219543457, + "logps/chosen": -169.0750274658203, + "logps/rejected": -208.6610107421875, + "loss": 0.5765, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.1630382537841797, + "rewards/margins": 0.40251216292381287, + "rewards/rejected": -1.565550446510315, + "step": 10750 + }, + { + "epoch": 1.853893866299104, + "grad_norm": 38.18898010253906, + "learning_rate": 3.82522046887724e-08, + "logits/chosen": -2.3010048866271973, + "logits/rejected": -2.277493715286255, + "logps/chosen": -179.2954559326172, + "logps/rejected": -204.9003143310547, + "loss": 0.623, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.2467565536499023, + "rewards/margins": 0.2810923159122467, + "rewards/rejected": -1.5278488397598267, + "step": 10760 + }, + { + "epoch": 1.8556168159889732, + "grad_norm": 33.77649688720703, + "learning_rate": 3.815479227545633e-08, + "logits/chosen": -2.295396327972412, + "logits/rejected": -2.279035806655884, + "logps/chosen": -181.14846801757812, + "logps/rejected": -210.6282958984375, + "loss": 0.598, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.2278902530670166, + "rewards/margins": 0.3429439067840576, + "rewards/rejected": -1.5708341598510742, + "step": 10770 + }, + { + "epoch": 1.8573397656788422, + "grad_norm": 31.69585418701172, + "learning_rate": 3.8057427472809736e-08, + "logits/chosen": -2.4099316596984863, + "logits/rejected": -2.3927342891693115, + "logps/chosen": -173.10000610351562, + "logps/rejected": -204.47640991210938, + "loss": 0.6153, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1872986555099487, + "rewards/margins": 0.30703839659690857, + "rewards/rejected": -1.4943370819091797, + "step": 10780 + }, + { + "epoch": 1.8590627153687111, + "grad_norm": 37.59237289428711, + "learning_rate": 3.796011067218101e-08, + "logits/chosen": -2.436169147491455, + "logits/rejected": -2.4094064235687256, + "logps/chosen": -167.60398864746094, + "logps/rejected": -195.60458374023438, + "loss": 0.6161, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.142968773841858, + "rewards/margins": 0.29017987847328186, + "rewards/rejected": -1.4331486225128174, + "step": 10790 + }, + { + "epoch": 1.8607856650585803, + "grad_norm": 29.355329513549805, + "learning_rate": 3.786284226472565e-08, + "logits/chosen": -2.3927254676818848, + "logits/rejected": -2.368919849395752, + "logps/chosen": -168.04922485351562, + "logps/rejected": -204.6519317626953, + "loss": 0.5889, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.1452404260635376, + "rewards/margins": 0.37039321660995483, + "rewards/rejected": -1.5156338214874268, + "step": 10800 + }, + { + "epoch": 1.8607856650585803, + "eval_logits/chosen": -2.403923511505127, + "eval_logits/rejected": -2.393235445022583, + "eval_logps/chosen": -160.73316955566406, + "eval_logps/rejected": -180.2567596435547, + "eval_loss": 0.6505710482597351, + "eval_rewards/accuracies": 0.6054832935333252, + "eval_rewards/chosen": -1.017176866531372, + "eval_rewards/margins": 0.15789496898651123, + "eval_rewards/rejected": -1.1750717163085938, + "eval_runtime": 384.6021, + "eval_samples_per_second": 11.191, + "eval_steps_per_second": 1.399, + "step": 10800 + }, + { + "epoch": 1.8625086147484493, + "grad_norm": 28.493104934692383, + "learning_rate": 3.776562264140464e-08, + "logits/chosen": -2.365048408508301, + "logits/rejected": -2.3278393745422363, + "logps/chosen": -180.26234436035156, + "logps/rejected": -199.35757446289062, + "loss": 0.6182, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.215912103652954, + "rewards/margins": 0.285257488489151, + "rewards/rejected": -1.5011695623397827, + "step": 10810 + }, + { + "epoch": 1.8642315644383185, + "grad_norm": 33.84832763671875, + "learning_rate": 3.766845219298291e-08, + "logits/chosen": -2.3179924488067627, + "logits/rejected": -2.294447422027588, + "logps/chosen": -167.72108459472656, + "logps/rejected": -197.6214141845703, + "loss": 0.6103, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1377894878387451, + "rewards/margins": 0.3336928188800812, + "rewards/rejected": -1.471482515335083, + "step": 10820 + }, + { + "epoch": 1.8659545141281875, + "grad_norm": 32.825592041015625, + "learning_rate": 3.757133131002764e-08, + "logits/chosen": -2.337118148803711, + "logits/rejected": -2.3134689331054688, + "logps/chosen": -176.63125610351562, + "logps/rejected": -202.58164978027344, + "loss": 0.6203, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.2352012395858765, + "rewards/margins": 0.26735469698905945, + "rewards/rejected": -1.5025558471679688, + "step": 10830 + }, + { + "epoch": 1.8676774638180564, + "grad_norm": 25.85580062866211, + "learning_rate": 3.747426038290689e-08, + "logits/chosen": -2.346402645111084, + "logits/rejected": -2.325465440750122, + "logps/chosen": -167.85498046875, + "logps/rejected": -191.32839965820312, + "loss": 0.6291, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.1382768154144287, + "rewards/margins": 0.26838475465774536, + "rewards/rejected": -1.4066616296768188, + "step": 10840 + }, + { + "epoch": 1.8694004135079254, + "grad_norm": 30.227493286132812, + "learning_rate": 3.737723980178786e-08, + "logits/chosen": -2.3299098014831543, + "logits/rejected": -2.309762477874756, + "logps/chosen": -159.28550720214844, + "logps/rejected": -191.73582458496094, + "loss": 0.5993, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.0782078504562378, + "rewards/margins": 0.2990468144416809, + "rewards/rejected": -1.377254605293274, + "step": 10850 + }, + { + "epoch": 1.8711233631977946, + "grad_norm": 34.90668487548828, + "learning_rate": 3.7280269956635414e-08, + "logits/chosen": -2.394364595413208, + "logits/rejected": -2.3578202724456787, + "logps/chosen": -168.866943359375, + "logps/rejected": -199.4810333251953, + "loss": 0.5919, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.1270270347595215, + "rewards/margins": 0.3460743725299835, + "rewards/rejected": -1.4731013774871826, + "step": 10860 + }, + { + "epoch": 1.8728463128876638, + "grad_norm": 25.8541259765625, + "learning_rate": 3.718335123721054e-08, + "logits/chosen": -2.2831482887268066, + "logits/rejected": -2.2722556591033936, + "logps/chosen": -161.58595275878906, + "logps/rejected": -201.04360961914062, + "loss": 0.5987, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.107114553451538, + "rewards/margins": 0.34930944442749023, + "rewards/rejected": -1.4564241170883179, + "step": 10870 + }, + { + "epoch": 1.8745692625775328, + "grad_norm": 34.01238250732422, + "learning_rate": 3.708648403306859e-08, + "logits/chosen": -2.3505282402038574, + "logits/rejected": -2.3237061500549316, + "logps/chosen": -172.83595275878906, + "logps/rejected": -201.0712127685547, + "loss": 0.5968, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.1779439449310303, + "rewards/margins": 0.3129883408546448, + "rewards/rejected": -1.4909324645996094, + "step": 10880 + }, + { + "epoch": 1.8762922122674017, + "grad_norm": 38.18024444580078, + "learning_rate": 3.698966873355802e-08, + "logits/chosen": -2.374328374862671, + "logits/rejected": -2.3627209663391113, + "logps/chosen": -175.97042846679688, + "logps/rejected": -203.05239868164062, + "loss": 0.6211, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.2167003154754639, + "rewards/margins": 0.27208462357521057, + "rewards/rejected": -1.4887850284576416, + "step": 10890 + }, + { + "epoch": 1.8780151619572707, + "grad_norm": 31.21358299255371, + "learning_rate": 3.6892905727818544e-08, + "logits/chosen": -2.4051666259765625, + "logits/rejected": -2.3722360134124756, + "logps/chosen": -170.09390258789062, + "logps/rejected": -193.1257781982422, + "loss": 0.6076, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.1315958499908447, + "rewards/margins": 0.2980080544948578, + "rewards/rejected": -1.429603934288025, + "step": 10900 + }, + { + "epoch": 1.8797381116471399, + "grad_norm": 38.000186920166016, + "learning_rate": 3.679619540477975e-08, + "logits/chosen": -2.3199026584625244, + "logits/rejected": -2.2889769077301025, + "logps/chosen": -169.51327514648438, + "logps/rejected": -195.25967407226562, + "loss": 0.6133, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.1240684986114502, + "rewards/margins": 0.288606733083725, + "rewards/rejected": -1.412675142288208, + "step": 10910 + }, + { + "epoch": 1.881461061337009, + "grad_norm": 34.62694549560547, + "learning_rate": 3.669953815315943e-08, + "logits/chosen": -2.3156676292419434, + "logits/rejected": -2.292684555053711, + "logps/chosen": -174.4659881591797, + "logps/rejected": -199.62318420410156, + "loss": 0.6055, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.180438756942749, + "rewards/margins": 0.2978615164756775, + "rewards/rejected": -1.4783003330230713, + "step": 10920 + }, + { + "epoch": 1.883184011026878, + "grad_norm": 34.97676086425781, + "learning_rate": 3.6602934361462065e-08, + "logits/chosen": -2.2863287925720215, + "logits/rejected": -2.2612805366516113, + "logps/chosen": -173.59727478027344, + "logps/rejected": -189.28097534179688, + "loss": 0.6444, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.1842314004898071, + "rewards/margins": 0.19635871052742004, + "rewards/rejected": -1.3805900812149048, + "step": 10930 + }, + { + "epoch": 1.884906960716747, + "grad_norm": 36.55752944946289, + "learning_rate": 3.6506384417977314e-08, + "logits/chosen": -2.2828993797302246, + "logits/rejected": -2.265763282775879, + "logps/chosen": -173.64022827148438, + "logps/rejected": -195.41806030273438, + "loss": 0.6228, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1666899919509888, + "rewards/margins": 0.24730460345745087, + "rewards/rejected": -1.413994550704956, + "step": 10940 + }, + { + "epoch": 1.886629910406616, + "grad_norm": 33.55683135986328, + "learning_rate": 3.6409888710778344e-08, + "logits/chosen": -2.331960439682007, + "logits/rejected": -2.317816972732544, + "logps/chosen": -167.28970336914062, + "logps/rejected": -188.5340576171875, + "loss": 0.6268, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.1197372674942017, + "rewards/margins": 0.22455672919750214, + "rewards/rejected": -1.3442939519882202, + "step": 10950 + }, + { + "epoch": 1.8883528600964852, + "grad_norm": 36.97398376464844, + "learning_rate": 3.631344762772034e-08, + "logits/chosen": -2.3534226417541504, + "logits/rejected": -2.3306939601898193, + "logps/chosen": -163.12069702148438, + "logps/rejected": -192.68130493164062, + "loss": 0.6123, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.0892016887664795, + "rewards/margins": 0.3190400302410126, + "rewards/rejected": -1.4082419872283936, + "step": 10960 + }, + { + "epoch": 1.8900758097863544, + "grad_norm": 30.5578556060791, + "learning_rate": 3.621706155643891e-08, + "logits/chosen": -2.372744083404541, + "logits/rejected": -2.340273380279541, + "logps/chosen": -158.9438018798828, + "logps/rejected": -191.1975555419922, + "loss": 0.5876, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.013116478919983, + "rewards/margins": 0.36311542987823486, + "rewards/rejected": -1.3762319087982178, + "step": 10970 + }, + { + "epoch": 1.8917987594762233, + "grad_norm": 50.900856018066406, + "learning_rate": 3.612073088434858e-08, + "logits/chosen": -2.3769946098327637, + "logits/rejected": -2.3548004627227783, + "logps/chosen": -167.50221252441406, + "logps/rejected": -200.83953857421875, + "loss": 0.6065, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.1152487993240356, + "rewards/margins": 0.34097960591316223, + "rewards/rejected": -1.456228494644165, + "step": 10980 + }, + { + "epoch": 1.8935217091660923, + "grad_norm": 27.747989654541016, + "learning_rate": 3.6024455998641206e-08, + "logits/chosen": -2.3134467601776123, + "logits/rejected": -2.298866033554077, + "logps/chosen": -158.34649658203125, + "logps/rejected": -185.16175842285156, + "loss": 0.6092, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.0483105182647705, + "rewards/margins": 0.278871089220047, + "rewards/rejected": -1.3271814584732056, + "step": 10990 + }, + { + "epoch": 1.8952446588559613, + "grad_norm": 30.30034637451172, + "learning_rate": 3.592823728628439e-08, + "logits/chosen": -2.4606642723083496, + "logits/rejected": -2.453303575515747, + "logps/chosen": -156.36085510253906, + "logps/rejected": -186.90420532226562, + "loss": 0.618, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.0328586101531982, + "rewards/margins": 0.2768515944480896, + "rewards/rejected": -1.309709906578064, + "step": 11000 + }, + { + "epoch": 1.8969676085458305, + "grad_norm": 28.235002517700195, + "learning_rate": 3.5832075134019955e-08, + "logits/chosen": -2.380627155303955, + "logits/rejected": -2.348646640777588, + "logps/chosen": -154.5124053955078, + "logps/rejected": -189.811279296875, + "loss": 0.5734, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.005467414855957, + "rewards/margins": 0.38841429352760315, + "rewards/rejected": -1.3938816785812378, + "step": 11010 + }, + { + "epoch": 1.8986905582356997, + "grad_norm": 32.709320068359375, + "learning_rate": 3.573596992836239e-08, + "logits/chosen": -2.406069278717041, + "logits/rejected": -2.386014223098755, + "logps/chosen": -160.51290893554688, + "logps/rejected": -179.46875, + "loss": 0.6359, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.0448853969573975, + "rewards/margins": 0.22973811626434326, + "rewards/rejected": -1.2746235132217407, + "step": 11020 + }, + { + "epoch": 1.9004135079255686, + "grad_norm": 34.195552825927734, + "learning_rate": 3.5639922055597306e-08, + "logits/chosen": -2.4023020267486572, + "logits/rejected": -2.3860769271850586, + "logps/chosen": -170.53150939941406, + "logps/rejected": -193.5362548828125, + "loss": 0.6419, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.1451653242111206, + "rewards/margins": 0.25867563486099243, + "rewards/rejected": -1.4038410186767578, + "step": 11030 + }, + { + "epoch": 1.9021364576154376, + "grad_norm": 29.43927001953125, + "learning_rate": 3.5543931901779855e-08, + "logits/chosen": -2.412224531173706, + "logits/rejected": -2.393313407897949, + "logps/chosen": -164.941650390625, + "logps/rejected": -190.35043334960938, + "loss": 0.6026, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.0708945989608765, + "rewards/margins": 0.28530409932136536, + "rewards/rejected": -1.3561986684799194, + "step": 11040 + }, + { + "epoch": 1.9038594073053066, + "grad_norm": 32.702964782714844, + "learning_rate": 3.544799985273321e-08, + "logits/chosen": -2.363358974456787, + "logits/rejected": -2.34291672706604, + "logps/chosen": -150.8433837890625, + "logps/rejected": -184.0169219970703, + "loss": 0.5946, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.9849146008491516, + "rewards/margins": 0.3147234320640564, + "rewards/rejected": -1.2996381521224976, + "step": 11050 + }, + { + "epoch": 1.9055823569951758, + "grad_norm": 32.199180603027344, + "learning_rate": 3.535212629404697e-08, + "logits/chosen": -2.3609559535980225, + "logits/rejected": -2.3134467601776123, + "logps/chosen": -164.59950256347656, + "logps/rejected": -197.69320678710938, + "loss": 0.5852, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.0920616388320923, + "rewards/margins": 0.37384092807769775, + "rewards/rejected": -1.46590256690979, + "step": 11060 + }, + { + "epoch": 1.907305306685045, + "grad_norm": 32.38897705078125, + "learning_rate": 3.525631161107564e-08, + "logits/chosen": -2.3948540687561035, + "logits/rejected": -2.350449562072754, + "logps/chosen": -169.53561401367188, + "logps/rejected": -205.30337524414062, + "loss": 0.5558, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.148709774017334, + "rewards/margins": 0.4157761037349701, + "rewards/rejected": -1.5644859075546265, + "step": 11070 + }, + { + "epoch": 1.909028256374914, + "grad_norm": 25.04302978515625, + "learning_rate": 3.516055618893712e-08, + "logits/chosen": -2.3484442234039307, + "logits/rejected": -2.3237853050231934, + "logps/chosen": -187.7406005859375, + "logps/rejected": -216.2569122314453, + "loss": 0.5911, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.306531548500061, + "rewards/margins": 0.3401932120323181, + "rewards/rejected": -1.6467249393463135, + "step": 11080 + }, + { + "epoch": 1.9107512060647829, + "grad_norm": 27.777585983276367, + "learning_rate": 3.50648604125111e-08, + "logits/chosen": -2.333702564239502, + "logits/rejected": -2.307685136795044, + "logps/chosen": -190.0394287109375, + "logps/rejected": -215.70840454101562, + "loss": 0.6033, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.319045066833496, + "rewards/margins": 0.31661438941955566, + "rewards/rejected": -1.6356592178344727, + "step": 11090 + }, + { + "epoch": 1.9124741557546519, + "grad_norm": 31.222261428833008, + "learning_rate": 3.496922466643748e-08, + "logits/chosen": -2.2713112831115723, + "logits/rejected": -2.255934238433838, + "logps/chosen": -176.26388549804688, + "logps/rejected": -202.74990844726562, + "loss": 0.6314, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.2430853843688965, + "rewards/margins": 0.2581595778465271, + "rewards/rejected": -1.5012450218200684, + "step": 11100 + }, + { + "epoch": 1.914197105444521, + "grad_norm": 29.14847183227539, + "learning_rate": 3.487364933511494e-08, + "logits/chosen": -2.318223476409912, + "logits/rejected": -2.2866435050964355, + "logps/chosen": -188.31454467773438, + "logps/rejected": -228.31185913085938, + "loss": 0.5999, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.3372859954833984, + "rewards/margins": 0.4245668947696686, + "rewards/rejected": -1.7618528604507446, + "step": 11110 + }, + { + "epoch": 1.9159200551343902, + "grad_norm": 28.7227783203125, + "learning_rate": 3.4778134802699274e-08, + "logits/chosen": -2.401853084564209, + "logits/rejected": -2.3755974769592285, + "logps/chosen": -185.71531677246094, + "logps/rejected": -211.0960693359375, + "loss": 0.6019, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.251940131187439, + "rewards/margins": 0.33058732748031616, + "rewards/rejected": -1.5825273990631104, + "step": 11120 + }, + { + "epoch": 1.9176430048242592, + "grad_norm": 38.03221893310547, + "learning_rate": 3.4682681453101966e-08, + "logits/chosen": -2.3060457706451416, + "logits/rejected": -2.27876615524292, + "logps/chosen": -185.47006225585938, + "logps/rejected": -210.9840087890625, + "loss": 0.6077, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.2905194759368896, + "rewards/margins": 0.29180076718330383, + "rewards/rejected": -1.5823204517364502, + "step": 11130 + }, + { + "epoch": 1.9193659545141282, + "grad_norm": 36.275672912597656, + "learning_rate": 3.458728966998853e-08, + "logits/chosen": -2.2617485523223877, + "logits/rejected": -2.241072416305542, + "logps/chosen": -178.1785430908203, + "logps/rejected": -203.6383819580078, + "loss": 0.6146, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.2262318134307861, + "rewards/margins": 0.2985805869102478, + "rewards/rejected": -1.5248124599456787, + "step": 11140 + }, + { + "epoch": 1.9210889042039971, + "grad_norm": 27.28822135925293, + "learning_rate": 3.4491959836777025e-08, + "logits/chosen": -2.3436708450317383, + "logits/rejected": -2.3321614265441895, + "logps/chosen": -181.6899871826172, + "logps/rejected": -201.81768798828125, + "loss": 0.6403, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.2293510437011719, + "rewards/margins": 0.23338110744953156, + "rewards/rejected": -1.4627320766448975, + "step": 11150 + }, + { + "epoch": 1.9228118538938663, + "grad_norm": 35.84111785888672, + "learning_rate": 3.439669233663651e-08, + "logits/chosen": -2.3769030570983887, + "logits/rejected": -2.3559274673461914, + "logps/chosen": -171.7718963623047, + "logps/rejected": -197.13270568847656, + "loss": 0.6136, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.184159517288208, + "rewards/margins": 0.280154824256897, + "rewards/rejected": -1.464314579963684, + "step": 11160 + }, + { + "epoch": 1.9245348035837355, + "grad_norm": 41.14706802368164, + "learning_rate": 3.430148755248552e-08, + "logits/chosen": -2.2820653915405273, + "logits/rejected": -2.2608208656311035, + "logps/chosen": -178.73739624023438, + "logps/rejected": -189.0192108154297, + "loss": 0.6456, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2014598846435547, + "rewards/margins": 0.20202994346618652, + "rewards/rejected": -1.4034898281097412, + "step": 11170 + }, + { + "epoch": 1.9262577532736045, + "grad_norm": 30.83884048461914, + "learning_rate": 3.4206345866990535e-08, + "logits/chosen": -2.3826446533203125, + "logits/rejected": -2.3630967140197754, + "logps/chosen": -181.05758666992188, + "logps/rejected": -206.005615234375, + "loss": 0.6197, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.2589973211288452, + "rewards/margins": 0.29420050978660583, + "rewards/rejected": -1.553197979927063, + "step": 11180 + }, + { + "epoch": 1.9279807029634735, + "grad_norm": 33.7467041015625, + "learning_rate": 3.41112676625643e-08, + "logits/chosen": -2.380197048187256, + "logits/rejected": -2.3536598682403564, + "logps/chosen": -167.689697265625, + "logps/rejected": -200.7379608154297, + "loss": 0.5648, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.117105484008789, + "rewards/margins": 0.3807833790779114, + "rewards/rejected": -1.4978888034820557, + "step": 11190 + }, + { + "epoch": 1.9297036526533424, + "grad_norm": 38.58169937133789, + "learning_rate": 3.401625332136455e-08, + "logits/chosen": -2.3757309913635254, + "logits/rejected": -2.3479130268096924, + "logps/chosen": -165.47581481933594, + "logps/rejected": -200.92559814453125, + "loss": 0.5685, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.1087088584899902, + "rewards/margins": 0.36637088656425476, + "rewards/rejected": -1.4750797748565674, + "step": 11200 + }, + { + "epoch": 1.9297036526533424, + "eval_logits/chosen": -2.3992388248443604, + "eval_logits/rejected": -2.3886518478393555, + "eval_logps/chosen": -161.57830810546875, + "eval_logps/rejected": -181.8199920654297, + "eval_loss": 0.6485751867294312, + "eval_rewards/accuracies": 0.5992100238800049, + "eval_rewards/chosen": -1.0256284475326538, + "eval_rewards/margins": 0.16507543623447418, + "eval_rewards/rejected": -1.1907037496566772, + "eval_runtime": 384.7254, + "eval_samples_per_second": 11.187, + "eval_steps_per_second": 1.398, + "step": 11200 + }, + { + "epoch": 1.9314266023432116, + "grad_norm": 31.150392532348633, + "learning_rate": 3.3921303225292226e-08, + "logits/chosen": -2.2678802013397217, + "logits/rejected": -2.243900775909424, + "logps/chosen": -170.72007751464844, + "logps/rejected": -208.6854705810547, + "loss": 0.5868, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.1942079067230225, + "rewards/margins": 0.3671010732650757, + "rewards/rejected": -1.5613089799880981, + "step": 11210 + }, + { + "epoch": 1.9331495520330806, + "grad_norm": 44.891056060791016, + "learning_rate": 3.382641775599008e-08, + "logits/chosen": -2.316336154937744, + "logits/rejected": -2.302032947540283, + "logps/chosen": -177.46009826660156, + "logps/rejected": -207.8301239013672, + "loss": 0.6361, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.2496541738510132, + "rewards/margins": 0.2785920202732086, + "rewards/rejected": -1.5282460451126099, + "step": 11220 + }, + { + "epoch": 1.9348725017229498, + "grad_norm": 30.4825496673584, + "learning_rate": 3.373159729484113e-08, + "logits/chosen": -2.2911577224731445, + "logits/rejected": -2.276029348373413, + "logps/chosen": -193.60427856445312, + "logps/rejected": -210.3373565673828, + "loss": 0.6456, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.3614342212677002, + "rewards/margins": 0.2419278621673584, + "rewards/rejected": -1.6033620834350586, + "step": 11230 + }, + { + "epoch": 1.9365954514128187, + "grad_norm": 27.854116439819336, + "learning_rate": 3.363684222296704e-08, + "logits/chosen": -2.3137216567993164, + "logits/rejected": -2.2912824153900146, + "logps/chosen": -181.1165771484375, + "logps/rejected": -202.65650939941406, + "loss": 0.6348, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.2578670978546143, + "rewards/margins": 0.24567310512065887, + "rewards/rejected": -1.503540277481079, + "step": 11240 + }, + { + "epoch": 1.9383184011026877, + "grad_norm": 36.37520980834961, + "learning_rate": 3.3542152921226686e-08, + "logits/chosen": -2.33618426322937, + "logits/rejected": -2.308847188949585, + "logps/chosen": -171.78488159179688, + "logps/rejected": -205.62368774414062, + "loss": 0.5829, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.1678929328918457, + "rewards/margins": 0.3575848937034607, + "rewards/rejected": -1.5254778861999512, + "step": 11250 + }, + { + "epoch": 1.940041350792557, + "grad_norm": 30.956172943115234, + "learning_rate": 3.3447529770214565e-08, + "logits/chosen": -2.29649019241333, + "logits/rejected": -2.265418291091919, + "logps/chosen": -179.08489990234375, + "logps/rejected": -201.74864196777344, + "loss": 0.6165, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.2031055688858032, + "rewards/margins": 0.3095191419124603, + "rewards/rejected": -1.5126248598098755, + "step": 11260 + }, + { + "epoch": 1.9417643004824259, + "grad_norm": 29.66795539855957, + "learning_rate": 3.335297315025935e-08, + "logits/chosen": -2.284562110900879, + "logits/rejected": -2.2554664611816406, + "logps/chosen": -176.65817260742188, + "logps/rejected": -206.97415161132812, + "loss": 0.5787, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.2032575607299805, + "rewards/margins": 0.36220937967300415, + "rewards/rejected": -1.5654670000076294, + "step": 11270 + }, + { + "epoch": 1.943487250172295, + "grad_norm": 31.585372924804688, + "learning_rate": 3.325848344142219e-08, + "logits/chosen": -2.338066816329956, + "logits/rejected": -2.3050124645233154, + "logps/chosen": -183.82411193847656, + "logps/rejected": -204.71006774902344, + "loss": 0.6312, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.2679367065429688, + "rewards/margins": 0.26346999406814575, + "rewards/rejected": -1.5314067602157593, + "step": 11280 + }, + { + "epoch": 1.945210199862164, + "grad_norm": 35.755943298339844, + "learning_rate": 3.3164061023495385e-08, + "logits/chosen": -2.318530559539795, + "logits/rejected": -2.295261859893799, + "logps/chosen": -185.0235137939453, + "logps/rejected": -217.89462280273438, + "loss": 0.5831, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.2432845830917358, + "rewards/margins": 0.38053396344184875, + "rewards/rejected": -1.6238186359405518, + "step": 11290 + }, + { + "epoch": 1.946933149552033, + "grad_norm": 37.60653305053711, + "learning_rate": 3.306970627600073e-08, + "logits/chosen": -2.2636165618896484, + "logits/rejected": -2.240103006362915, + "logps/chosen": -199.27163696289062, + "logps/rejected": -211.94677734375, + "loss": 0.662, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.392655611038208, + "rewards/margins": 0.1966453492641449, + "rewards/rejected": -1.5893008708953857, + "step": 11300 + }, + { + "epoch": 1.948656099241902, + "grad_norm": 35.965511322021484, + "learning_rate": 3.297541957818801e-08, + "logits/chosen": -2.362086772918701, + "logits/rejected": -2.3415915966033936, + "logps/chosen": -180.53945922851562, + "logps/rejected": -220.4781951904297, + "loss": 0.57, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.2809550762176514, + "rewards/margins": 0.4184480607509613, + "rewards/rejected": -1.699403166770935, + "step": 11310 + }, + { + "epoch": 1.9503790489317712, + "grad_norm": 26.968000411987305, + "learning_rate": 3.2881201309033555e-08, + "logits/chosen": -2.318852663040161, + "logits/rejected": -2.293492555618286, + "logps/chosen": -178.1951904296875, + "logps/rejected": -210.78713989257812, + "loss": 0.5892, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.2191166877746582, + "rewards/margins": 0.3563511371612549, + "rewards/rejected": -1.575467824935913, + "step": 11320 + }, + { + "epoch": 1.9521019986216404, + "grad_norm": 55.6558952331543, + "learning_rate": 3.278705184723856e-08, + "logits/chosen": -2.308856964111328, + "logits/rejected": -2.28068470954895, + "logps/chosen": -193.82093811035156, + "logps/rejected": -221.250244140625, + "loss": 0.6047, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.396980881690979, + "rewards/margins": 0.33789733052253723, + "rewards/rejected": -1.7348783016204834, + "step": 11330 + }, + { + "epoch": 1.9538249483115093, + "grad_norm": 42.39641571044922, + "learning_rate": 3.2692971571227705e-08, + "logits/chosen": -2.269031286239624, + "logits/rejected": -2.2357709407806396, + "logps/chosen": -187.25526428222656, + "logps/rejected": -226.68356323242188, + "loss": 0.5747, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.30891752243042, + "rewards/margins": 0.41980305314064026, + "rewards/rejected": -1.7287204265594482, + "step": 11340 + }, + { + "epoch": 1.9555478980013783, + "grad_norm": 40.96769332885742, + "learning_rate": 3.25989608591476e-08, + "logits/chosen": -2.3529152870178223, + "logits/rejected": -2.3288815021514893, + "logps/chosen": -194.79385375976562, + "logps/rejected": -220.18704223632812, + "loss": 0.6385, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3637688159942627, + "rewards/margins": 0.2891305983066559, + "rewards/rejected": -1.652899146080017, + "step": 11350 + }, + { + "epoch": 1.9572708476912473, + "grad_norm": 37.165679931640625, + "learning_rate": 3.250502008886524e-08, + "logits/chosen": -2.311553716659546, + "logits/rejected": -2.287487506866455, + "logps/chosen": -188.62142944335938, + "logps/rejected": -216.90078735351562, + "loss": 0.599, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.3451268672943115, + "rewards/margins": 0.34311169385910034, + "rewards/rejected": -1.688238501548767, + "step": 11360 + }, + { + "epoch": 1.9589937973811165, + "grad_norm": 29.16134262084961, + "learning_rate": 3.241114963796646e-08, + "logits/chosen": -2.303574562072754, + "logits/rejected": -2.284886360168457, + "logps/chosen": -185.53944396972656, + "logps/rejected": -211.68630981445312, + "loss": 0.6121, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.287890076637268, + "rewards/margins": 0.3026021122932434, + "rewards/rejected": -1.5904921293258667, + "step": 11370 + }, + { + "epoch": 1.9607167470709856, + "grad_norm": 36.85075378417969, + "learning_rate": 3.231734988375447e-08, + "logits/chosen": -2.2683825492858887, + "logits/rejected": -2.2413578033447266, + "logps/chosen": -181.06224060058594, + "logps/rejected": -215.37014770507812, + "loss": 0.605, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.2876007556915283, + "rewards/margins": 0.3665514588356018, + "rewards/rejected": -1.6541521549224854, + "step": 11380 + }, + { + "epoch": 1.9624396967608546, + "grad_norm": 30.369298934936523, + "learning_rate": 3.222362120324837e-08, + "logits/chosen": -2.3610501289367676, + "logits/rejected": -2.331861972808838, + "logps/chosen": -169.66964721679688, + "logps/rejected": -206.345703125, + "loss": 0.6025, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.16221022605896, + "rewards/margins": 0.3576747179031372, + "rewards/rejected": -1.5198849439620972, + "step": 11390 + }, + { + "epoch": 1.9641626464507236, + "grad_norm": 30.62260627746582, + "learning_rate": 3.2129963973181526e-08, + "logits/chosen": -2.341277599334717, + "logits/rejected": -2.3116092681884766, + "logps/chosen": -168.75254821777344, + "logps/rejected": -199.89877319335938, + "loss": 0.5973, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.1201581954956055, + "rewards/margins": 0.357011616230011, + "rewards/rejected": -1.4771697521209717, + "step": 11400 + }, + { + "epoch": 1.9658855961405926, + "grad_norm": 34.26206588745117, + "learning_rate": 3.2036378570000146e-08, + "logits/chosen": -2.3451850414276123, + "logits/rejected": -2.3163180351257324, + "logps/chosen": -168.078125, + "logps/rejected": -200.7991485595703, + "loss": 0.5808, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.1173222064971924, + "rewards/margins": 0.35028529167175293, + "rewards/rejected": -1.4676073789596558, + "step": 11410 + }, + { + "epoch": 1.9676085458304617, + "grad_norm": 31.360225677490234, + "learning_rate": 3.1942865369861704e-08, + "logits/chosen": -2.313110113143921, + "logits/rejected": -2.287832736968994, + "logps/chosen": -178.32369995117188, + "logps/rejected": -197.38841247558594, + "loss": 0.6376, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.2378849983215332, + "rewards/margins": 0.2414713352918625, + "rewards/rejected": -1.4793564081192017, + "step": 11420 + }, + { + "epoch": 1.969331495520331, + "grad_norm": 35.559757232666016, + "learning_rate": 3.18494247486335e-08, + "logits/chosen": -2.277827739715576, + "logits/rejected": -2.254122257232666, + "logps/chosen": -170.04818725585938, + "logps/rejected": -196.9369354248047, + "loss": 0.607, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.1452734470367432, + "rewards/margins": 0.3120071291923523, + "rewards/rejected": -1.4572807550430298, + "step": 11430 + }, + { + "epoch": 1.9710544452102, + "grad_norm": 28.967737197875977, + "learning_rate": 3.1756057081891104e-08, + "logits/chosen": -2.31591796875, + "logits/rejected": -2.3056747913360596, + "logps/chosen": -167.77761840820312, + "logps/rejected": -187.48861694335938, + "loss": 0.6362, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1490370035171509, + "rewards/margins": 0.20246684551239014, + "rewards/rejected": -1.3515037298202515, + "step": 11440 + }, + { + "epoch": 1.9727773949000689, + "grad_norm": 33.354129791259766, + "learning_rate": 3.166276274491684e-08, + "logits/chosen": -2.3166346549987793, + "logits/rejected": -2.2880496978759766, + "logps/chosen": -165.89306640625, + "logps/rejected": -200.16159057617188, + "loss": 0.5776, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.1227996349334717, + "rewards/margins": 0.36727264523506165, + "rewards/rejected": -1.490072250366211, + "step": 11450 + }, + { + "epoch": 1.9745003445899378, + "grad_norm": 27.8592529296875, + "learning_rate": 3.156954211269828e-08, + "logits/chosen": -2.3116652965545654, + "logits/rejected": -2.2855889797210693, + "logps/chosen": -167.8182373046875, + "logps/rejected": -199.59274291992188, + "loss": 0.593, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.1173088550567627, + "rewards/margins": 0.34456413984298706, + "rewards/rejected": -1.4618730545043945, + "step": 11460 + }, + { + "epoch": 1.976223294279807, + "grad_norm": 33.00469207763672, + "learning_rate": 3.147639555992677e-08, + "logits/chosen": -2.3325459957122803, + "logits/rejected": -2.2897353172302246, + "logps/chosen": -187.8080596923828, + "logps/rejected": -216.45242309570312, + "loss": 0.5906, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.2975906133651733, + "rewards/margins": 0.36165517568588257, + "rewards/rejected": -1.6592458486557007, + "step": 11470 + }, + { + "epoch": 1.9779462439696762, + "grad_norm": 35.083377838134766, + "learning_rate": 3.138332346099587e-08, + "logits/chosen": -2.3586764335632324, + "logits/rejected": -2.320845603942871, + "logps/chosen": -170.81422424316406, + "logps/rejected": -194.9658966064453, + "loss": 0.6084, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.132930040359497, + "rewards/margins": 0.3183945119380951, + "rewards/rejected": -1.4513245820999146, + "step": 11480 + }, + { + "epoch": 1.9796691936595452, + "grad_norm": 31.931344985961914, + "learning_rate": 3.129032618999994e-08, + "logits/chosen": -2.3237130641937256, + "logits/rejected": -2.2973456382751465, + "logps/chosen": -183.87399291992188, + "logps/rejected": -200.5390625, + "loss": 0.6421, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.2792707681655884, + "rewards/margins": 0.219514399766922, + "rewards/rejected": -1.4987852573394775, + "step": 11490 + }, + { + "epoch": 1.9813921433494142, + "grad_norm": 33.240440368652344, + "learning_rate": 3.119740412073252e-08, + "logits/chosen": -2.346156597137451, + "logits/rejected": -2.3272767066955566, + "logps/chosen": -165.01187133789062, + "logps/rejected": -182.28871154785156, + "loss": 0.6523, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.1343519687652588, + "rewards/margins": 0.1887514442205429, + "rewards/rejected": -1.3231033086776733, + "step": 11500 + }, + { + "epoch": 1.9831150930392831, + "grad_norm": 49.612754821777344, + "learning_rate": 3.1104557626684884e-08, + "logits/chosen": -2.302736282348633, + "logits/rejected": -2.2873153686523438, + "logps/chosen": -168.75994873046875, + "logps/rejected": -200.77427673339844, + "loss": 0.6094, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.1248788833618164, + "rewards/margins": 0.3630138337612152, + "rewards/rejected": -1.487892746925354, + "step": 11510 + }, + { + "epoch": 1.9848380427291523, + "grad_norm": 24.72926902770996, + "learning_rate": 3.101178708104456e-08, + "logits/chosen": -2.3570656776428223, + "logits/rejected": -2.3030755519866943, + "logps/chosen": -162.44204711914062, + "logps/rejected": -187.81796264648438, + "loss": 0.5762, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.0133875608444214, + "rewards/margins": 0.3679368793964386, + "rewards/rejected": -1.3813245296478271, + "step": 11520 + }, + { + "epoch": 1.9865609924190215, + "grad_norm": 27.86030387878418, + "learning_rate": 3.091909285669383e-08, + "logits/chosen": -2.326167583465576, + "logits/rejected": -2.2939858436584473, + "logps/chosen": -157.47518920898438, + "logps/rejected": -177.9588623046875, + "loss": 0.6127, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0260807275772095, + "rewards/margins": 0.27049964666366577, + "rewards/rejected": -1.29658043384552, + "step": 11530 + }, + { + "epoch": 1.9882839421088905, + "grad_norm": 30.32522964477539, + "learning_rate": 3.082647532620817e-08, + "logits/chosen": -2.359875440597534, + "logits/rejected": -2.3462164402008057, + "logps/chosen": -158.7584686279297, + "logps/rejected": -202.20672607421875, + "loss": 0.5737, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.0689572095870972, + "rewards/margins": 0.4044581353664398, + "rewards/rejected": -1.4734153747558594, + "step": 11540 + }, + { + "epoch": 1.9900068917987594, + "grad_norm": 32.972084045410156, + "learning_rate": 3.0733934861854794e-08, + "logits/chosen": -2.3069303035736084, + "logits/rejected": -2.2892849445343018, + "logps/chosen": -162.90286254882812, + "logps/rejected": -193.65733337402344, + "loss": 0.6039, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.1045806407928467, + "rewards/margins": 0.3092039227485657, + "rewards/rejected": -1.4137846231460571, + "step": 11550 + }, + { + "epoch": 1.9917298414886284, + "grad_norm": 31.083885192871094, + "learning_rate": 3.0641471835591184e-08, + "logits/chosen": -2.3341760635375977, + "logits/rejected": -2.3034563064575195, + "logps/chosen": -171.6559600830078, + "logps/rejected": -197.1893768310547, + "loss": 0.6091, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.1536730527877808, + "rewards/margins": 0.3134633004665375, + "rewards/rejected": -1.467136263847351, + "step": 11560 + }, + { + "epoch": 1.9934527911784976, + "grad_norm": 28.85565185546875, + "learning_rate": 3.054908661906353e-08, + "logits/chosen": -2.350050210952759, + "logits/rejected": -2.329953193664551, + "logps/chosen": -171.4935760498047, + "logps/rejected": -201.5003662109375, + "loss": 0.6215, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.1677134037017822, + "rewards/margins": 0.3004491329193115, + "rewards/rejected": -1.4681622982025146, + "step": 11570 + }, + { + "epoch": 1.9951757408683668, + "grad_norm": 29.845497131347656, + "learning_rate": 3.045677958360532e-08, + "logits/chosen": -2.3807244300842285, + "logits/rejected": -2.356279134750366, + "logps/chosen": -170.1651611328125, + "logps/rejected": -197.697265625, + "loss": 0.6111, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.1297045946121216, + "rewards/margins": 0.30230602622032166, + "rewards/rejected": -1.432010531425476, + "step": 11580 + }, + { + "epoch": 1.9968986905582358, + "grad_norm": 38.318115234375, + "learning_rate": 3.0364551100235795e-08, + "logits/chosen": -2.295039415359497, + "logits/rejected": -2.261448860168457, + "logps/chosen": -164.27969360351562, + "logps/rejected": -175.9013214111328, + "loss": 0.6335, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.0416390895843506, + "rewards/margins": 0.21523399651050568, + "rewards/rejected": -1.2568730115890503, + "step": 11590 + }, + { + "epoch": 1.9986216402481047, + "grad_norm": 23.599227905273438, + "learning_rate": 3.027240153965839e-08, + "logits/chosen": -2.360468626022339, + "logits/rejected": -2.345468759536743, + "logps/chosen": -149.54684448242188, + "logps/rejected": -173.2725372314453, + "loss": 0.63, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.981712818145752, + "rewards/margins": 0.2234046459197998, + "rewards/rejected": -1.2051174640655518, + "step": 11600 + }, + { + "epoch": 1.9986216402481047, + "eval_logits/chosen": -2.410827159881592, + "eval_logits/rejected": -2.401189088821411, + "eval_logps/chosen": -147.70538330078125, + "eval_logps/rejected": -166.54612731933594, + "eval_loss": 0.6502494812011719, + "eval_rewards/accuracies": 0.6003717184066772, + "eval_rewards/chosen": -0.886898934841156, + "eval_rewards/margins": 0.15106609463691711, + "eval_rewards/rejected": -1.0379650592803955, + "eval_runtime": 384.8158, + "eval_samples_per_second": 11.185, + "eval_steps_per_second": 1.398, + "step": 11600 + }, + { + "epoch": 2.0003445899379737, + "grad_norm": 33.16267776489258, + "learning_rate": 3.0180331272259404e-08, + "logits/chosen": -2.310610771179199, + "logits/rejected": -2.2826197147369385, + "logps/chosen": -155.88720703125, + "logps/rejected": -188.38926696777344, + "loss": 0.5894, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.0316044092178345, + "rewards/margins": 0.3397893011569977, + "rewards/rejected": -1.3713937997817993, + "step": 11610 + }, + { + "epoch": 2.0020675396278427, + "grad_norm": 32.0125617980957, + "learning_rate": 3.0088340668106376e-08, + "logits/chosen": -2.345252275466919, + "logits/rejected": -2.328648805618286, + "logps/chosen": -167.07174682617188, + "logps/rejected": -193.3257598876953, + "loss": 0.621, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.098212480545044, + "rewards/margins": 0.2750307023525238, + "rewards/rejected": -1.3732430934906006, + "step": 11620 + }, + { + "epoch": 2.003790489317712, + "grad_norm": 29.57107925415039, + "learning_rate": 2.999643009694671e-08, + "logits/chosen": -2.3526864051818848, + "logits/rejected": -2.3216300010681152, + "logps/chosen": -166.13430786132812, + "logps/rejected": -191.68673706054688, + "loss": 0.6018, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.074463129043579, + "rewards/margins": 0.3187711238861084, + "rewards/rejected": -1.393234372138977, + "step": 11630 + }, + { + "epoch": 2.005513439007581, + "grad_norm": 28.913124084472656, + "learning_rate": 2.990459992820601e-08, + "logits/chosen": -2.365588426589966, + "logits/rejected": -2.34390926361084, + "logps/chosen": -157.5725555419922, + "logps/rejected": -182.8646697998047, + "loss": 0.6065, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.0403567552566528, + "rewards/margins": 0.29022300243377686, + "rewards/rejected": -1.3305796384811401, + "step": 11640 + }, + { + "epoch": 2.00723638869745, + "grad_norm": 26.71157455444336, + "learning_rate": 2.981285053098682e-08, + "logits/chosen": -2.2904579639434814, + "logits/rejected": -2.2564196586608887, + "logps/chosen": -150.6053924560547, + "logps/rejected": -181.81773376464844, + "loss": 0.5712, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.9526816606521606, + "rewards/margins": 0.35900914669036865, + "rewards/rejected": -1.3116909265518188, + "step": 11650 + }, + { + "epoch": 2.008959338387319, + "grad_norm": 26.587501525878906, + "learning_rate": 2.972118227406698e-08, + "logits/chosen": -2.367176055908203, + "logits/rejected": -2.3259403705596924, + "logps/chosen": -171.75442504882812, + "logps/rejected": -195.70761108398438, + "loss": 0.6051, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.1537991762161255, + "rewards/margins": 0.31325364112854004, + "rewards/rejected": -1.467052698135376, + "step": 11660 + }, + { + "epoch": 2.010682288077188, + "grad_norm": 39.934181213378906, + "learning_rate": 2.9629595525898188e-08, + "logits/chosen": -2.3322441577911377, + "logits/rejected": -2.292595624923706, + "logps/chosen": -169.62432861328125, + "logps/rejected": -215.9363250732422, + "loss": 0.5333, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.1304153203964233, + "rewards/margins": 0.5008441805839539, + "rewards/rejected": -1.6312596797943115, + "step": 11670 + }, + { + "epoch": 2.0124052377670574, + "grad_norm": 34.38392639160156, + "learning_rate": 2.9538090654604596e-08, + "logits/chosen": -2.2712364196777344, + "logits/rejected": -2.2498137950897217, + "logps/chosen": -174.3423614501953, + "logps/rejected": -211.03689575195312, + "loss": 0.5887, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.167636513710022, + "rewards/margins": 0.39166468381881714, + "rewards/rejected": -1.5593011379241943, + "step": 11680 + }, + { + "epoch": 2.0141281874569263, + "grad_norm": 31.447341918945312, + "learning_rate": 2.9446668027981127e-08, + "logits/chosen": -2.3560667037963867, + "logits/rejected": -2.3193726539611816, + "logps/chosen": -185.02783203125, + "logps/rejected": -213.715576171875, + "loss": 0.5913, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.2407398223876953, + "rewards/margins": 0.3654331564903259, + "rewards/rejected": -1.606172800064087, + "step": 11690 + }, + { + "epoch": 2.0158511371467953, + "grad_norm": 24.823726654052734, + "learning_rate": 2.9355328013492255e-08, + "logits/chosen": -2.406175136566162, + "logits/rejected": -2.3746700286865234, + "logps/chosen": -162.5546112060547, + "logps/rejected": -196.85719299316406, + "loss": 0.5813, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.05254328250885, + "rewards/margins": 0.3555363118648529, + "rewards/rejected": -1.4080796241760254, + "step": 11700 + }, + { + "epoch": 2.0175740868366643, + "grad_norm": 41.32663345336914, + "learning_rate": 2.926407097827034e-08, + "logits/chosen": -2.3307394981384277, + "logits/rejected": -2.298651933670044, + "logps/chosen": -172.9115753173828, + "logps/rejected": -200.8367919921875, + "loss": 0.5956, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.1703336238861084, + "rewards/margins": 0.3152288496494293, + "rewards/rejected": -1.4855625629425049, + "step": 11710 + }, + { + "epoch": 2.0192970365265333, + "grad_norm": 34.57819747924805, + "learning_rate": 2.917289728911424e-08, + "logits/chosen": -2.358710527420044, + "logits/rejected": -2.3419289588928223, + "logps/chosen": -177.53518676757812, + "logps/rejected": -204.48703002929688, + "loss": 0.627, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.223528265953064, + "rewards/margins": 0.2960992753505707, + "rewards/rejected": -1.5196274518966675, + "step": 11720 + }, + { + "epoch": 2.0210199862164027, + "grad_norm": 27.626047134399414, + "learning_rate": 2.90818073124878e-08, + "logits/chosen": -2.306004047393799, + "logits/rejected": -2.2871932983398438, + "logps/chosen": -179.81082153320312, + "logps/rejected": -212.8517303466797, + "loss": 0.6091, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2425450086593628, + "rewards/margins": 0.3176325261592865, + "rewards/rejected": -1.5601775646209717, + "step": 11730 + }, + { + "epoch": 2.0227429359062716, + "grad_norm": 31.49341583251953, + "learning_rate": 2.899080141451836e-08, + "logits/chosen": -2.345893621444702, + "logits/rejected": -2.32354474067688, + "logps/chosen": -174.951171875, + "logps/rejected": -205.5146484375, + "loss": 0.612, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.2136671543121338, + "rewards/margins": 0.3185957670211792, + "rewards/rejected": -1.5322628021240234, + "step": 11740 + }, + { + "epoch": 2.0244658855961406, + "grad_norm": 29.300275802612305, + "learning_rate": 2.8899879960995376e-08, + "logits/chosen": -2.3585095405578613, + "logits/rejected": -2.34792160987854, + "logps/chosen": -163.48052978515625, + "logps/rejected": -211.85824584960938, + "loss": 0.564, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.141412377357483, + "rewards/margins": 0.4344809651374817, + "rewards/rejected": -1.5758932828903198, + "step": 11750 + }, + { + "epoch": 2.0261888352860096, + "grad_norm": 33.636085510253906, + "learning_rate": 2.8809043317368876e-08, + "logits/chosen": -2.3376007080078125, + "logits/rejected": -2.3140745162963867, + "logps/chosen": -181.65231323242188, + "logps/rejected": -219.00009155273438, + "loss": 0.5894, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.2589913606643677, + "rewards/margins": 0.40537405014038086, + "rewards/rejected": -1.664365530014038, + "step": 11760 + }, + { + "epoch": 2.0279117849758785, + "grad_norm": 25.824485778808594, + "learning_rate": 2.871829184874795e-08, + "logits/chosen": -2.285428524017334, + "logits/rejected": -2.2535977363586426, + "logps/chosen": -181.88848876953125, + "logps/rejected": -210.92041015625, + "loss": 0.5936, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.2358392477035522, + "rewards/margins": 0.36059147119522095, + "rewards/rejected": -1.596430778503418, + "step": 11770 + }, + { + "epoch": 2.029634734665748, + "grad_norm": 38.72450637817383, + "learning_rate": 2.8627625919899363e-08, + "logits/chosen": -2.2584919929504395, + "logits/rejected": -2.238612413406372, + "logps/chosen": -175.74777221679688, + "logps/rejected": -206.83676147460938, + "loss": 0.613, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2549865245819092, + "rewards/margins": 0.2964690625667572, + "rewards/rejected": -1.5514557361602783, + "step": 11780 + }, + { + "epoch": 2.031357684355617, + "grad_norm": 31.6374454498291, + "learning_rate": 2.8537045895246103e-08, + "logits/chosen": -2.285067081451416, + "logits/rejected": -2.274216413497925, + "logps/chosen": -170.0199737548828, + "logps/rejected": -225.0682830810547, + "loss": 0.5148, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.195155382156372, + "rewards/margins": 0.5274872779846191, + "rewards/rejected": -1.7226425409317017, + "step": 11790 + }, + { + "epoch": 2.033080634045486, + "grad_norm": 40.261634826660156, + "learning_rate": 2.8446552138865797e-08, + "logits/chosen": -2.297799587249756, + "logits/rejected": -2.2750699520111084, + "logps/chosen": -197.58456420898438, + "logps/rejected": -222.3473663330078, + "loss": 0.6118, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.3904147148132324, + "rewards/margins": 0.30489349365234375, + "rewards/rejected": -1.6953080892562866, + "step": 11800 + }, + { + "epoch": 2.034803583735355, + "grad_norm": 37.03939437866211, + "learning_rate": 2.8356145014489408e-08, + "logits/chosen": -2.2964396476745605, + "logits/rejected": -2.2717151641845703, + "logps/chosen": -200.8223114013672, + "logps/rejected": -228.19686889648438, + "loss": 0.6273, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.4382585287094116, + "rewards/margins": 0.3232322633266449, + "rewards/rejected": -1.761490821838379, + "step": 11810 + }, + { + "epoch": 2.036526533425224, + "grad_norm": 30.490917205810547, + "learning_rate": 2.8265824885499605e-08, + "logits/chosen": -2.3136515617370605, + "logits/rejected": -2.2980868816375732, + "logps/chosen": -187.7859649658203, + "logps/rejected": -209.8665313720703, + "loss": 0.6629, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.3637282848358154, + "rewards/margins": 0.2283482849597931, + "rewards/rejected": -1.5920765399932861, + "step": 11820 + }, + { + "epoch": 2.0382494831150932, + "grad_norm": 44.555973052978516, + "learning_rate": 2.817559211492948e-08, + "logits/chosen": -2.2768752574920654, + "logits/rejected": -2.265839099884033, + "logps/chosen": -178.7625732421875, + "logps/rejected": -213.99478149414062, + "loss": 0.6033, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.2605791091918945, + "rewards/margins": 0.3402387499809265, + "rewards/rejected": -1.6008179187774658, + "step": 11830 + }, + { + "epoch": 2.039972432804962, + "grad_norm": 28.042152404785156, + "learning_rate": 2.80854470654609e-08, + "logits/chosen": -2.2855865955352783, + "logits/rejected": -2.257840871810913, + "logps/chosen": -186.37057495117188, + "logps/rejected": -222.32763671875, + "loss": 0.585, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.3317365646362305, + "rewards/margins": 0.37816449999809265, + "rewards/rejected": -1.7099010944366455, + "step": 11840 + }, + { + "epoch": 2.041695382494831, + "grad_norm": 32.66047668457031, + "learning_rate": 2.7995390099423217e-08, + "logits/chosen": -2.252861261367798, + "logits/rejected": -2.216279983520508, + "logps/chosen": -189.672607421875, + "logps/rejected": -216.9271697998047, + "loss": 0.6055, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.3427212238311768, + "rewards/margins": 0.34207189083099365, + "rewards/rejected": -1.68479323387146, + "step": 11850 + }, + { + "epoch": 2.0434183321847, + "grad_norm": 51.951820373535156, + "learning_rate": 2.7905421578791754e-08, + "logits/chosen": -2.326028347015381, + "logits/rejected": -2.3146190643310547, + "logps/chosen": -199.56216430664062, + "logps/rejected": -230.6979522705078, + "loss": 0.6341, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.4476042985916138, + "rewards/margins": 0.28900259733200073, + "rewards/rejected": -1.7366068363189697, + "step": 11860 + }, + { + "epoch": 2.045141281874569, + "grad_norm": 57.09029006958008, + "learning_rate": 2.7815541865186215e-08, + "logits/chosen": -2.2642533779144287, + "logits/rejected": -2.2513880729675293, + "logps/chosen": -184.52235412597656, + "logps/rejected": -224.866943359375, + "loss": 0.585, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.3491449356079102, + "rewards/margins": 0.36590269207954407, + "rewards/rejected": -1.7150475978851318, + "step": 11870 + }, + { + "epoch": 2.0468642315644385, + "grad_norm": 35.73862075805664, + "learning_rate": 2.7725751319869485e-08, + "logits/chosen": -2.284669876098633, + "logits/rejected": -2.2507481575012207, + "logps/chosen": -190.849365234375, + "logps/rejected": -236.66421508789062, + "loss": 0.5279, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.3258202075958252, + "rewards/margins": 0.49654918909072876, + "rewards/rejected": -1.8223693370819092, + "step": 11880 + }, + { + "epoch": 2.0485871812543075, + "grad_norm": 39.445770263671875, + "learning_rate": 2.7636050303746004e-08, + "logits/chosen": -2.3135440349578857, + "logits/rejected": -2.288027763366699, + "logps/chosen": -203.68576049804688, + "logps/rejected": -235.73757934570312, + "loss": 0.5891, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.456667423248291, + "rewards/margins": 0.382424533367157, + "rewards/rejected": -1.8390918970108032, + "step": 11890 + }, + { + "epoch": 2.0503101309441765, + "grad_norm": 54.75608825683594, + "learning_rate": 2.7546439177360336e-08, + "logits/chosen": -2.2159829139709473, + "logits/rejected": -2.188356637954712, + "logps/chosen": -197.61387634277344, + "logps/rejected": -231.9712677001953, + "loss": 0.6042, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.436052680015564, + "rewards/margins": 0.37783369421958923, + "rewards/rejected": -1.8138864040374756, + "step": 11900 + }, + { + "epoch": 2.0520330806340454, + "grad_norm": 31.257429122924805, + "learning_rate": 2.7456918300895748e-08, + "logits/chosen": -2.351876735687256, + "logits/rejected": -2.3544445037841797, + "logps/chosen": -193.8394012451172, + "logps/rejected": -233.034423828125, + "loss": 0.6153, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.404994249343872, + "rewards/margins": 0.352558970451355, + "rewards/rejected": -1.7575533390045166, + "step": 11910 + }, + { + "epoch": 2.0537560303239144, + "grad_norm": 56.795692443847656, + "learning_rate": 2.736748803417277e-08, + "logits/chosen": -2.3139095306396484, + "logits/rejected": -2.2963128089904785, + "logps/chosen": -198.30126953125, + "logps/rejected": -228.29336547851562, + "loss": 0.6077, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.3945560455322266, + "rewards/margins": 0.32481223344802856, + "rewards/rejected": -1.7193682193756104, + "step": 11920 + }, + { + "epoch": 2.055478980013784, + "grad_norm": 32.29816818237305, + "learning_rate": 2.7278148736647748e-08, + "logits/chosen": -2.3049678802490234, + "logits/rejected": -2.289769411087036, + "logps/chosen": -184.04885864257812, + "logps/rejected": -221.4799346923828, + "loss": 0.5865, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.3049168586730957, + "rewards/margins": 0.36031457781791687, + "rewards/rejected": -1.665231466293335, + "step": 11930 + }, + { + "epoch": 2.057201929703653, + "grad_norm": 32.54777145385742, + "learning_rate": 2.7188900767411338e-08, + "logits/chosen": -2.2739245891571045, + "logits/rejected": -2.247042179107666, + "logps/chosen": -177.62771606445312, + "logps/rejected": -216.1691131591797, + "loss": 0.5644, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.2208917140960693, + "rewards/margins": 0.41405215859413147, + "rewards/rejected": -1.6349437236785889, + "step": 11940 + }, + { + "epoch": 2.0589248793935218, + "grad_norm": 28.15619468688965, + "learning_rate": 2.709974448518718e-08, + "logits/chosen": -2.345357656478882, + "logits/rejected": -2.3193631172180176, + "logps/chosen": -193.7064666748047, + "logps/rejected": -215.07058715820312, + "loss": 0.6425, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3472740650177002, + "rewards/margins": 0.24748286604881287, + "rewards/rejected": -1.594757080078125, + "step": 11950 + }, + { + "epoch": 2.0606478290833907, + "grad_norm": 33.60971450805664, + "learning_rate": 2.7010680248330307e-08, + "logits/chosen": -2.2373039722442627, + "logits/rejected": -2.2140448093414307, + "logps/chosen": -184.1251220703125, + "logps/rejected": -227.1833953857422, + "loss": 0.5664, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.306465744972229, + "rewards/margins": 0.4474376142024994, + "rewards/rejected": -1.7539036273956299, + "step": 11960 + }, + { + "epoch": 2.0623707787732597, + "grad_norm": 33.096187591552734, + "learning_rate": 2.6921708414825857e-08, + "logits/chosen": -2.3217430114746094, + "logits/rejected": -2.297858715057373, + "logps/chosen": -187.7861328125, + "logps/rejected": -216.83102416992188, + "loss": 0.6186, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.306571125984192, + "rewards/margins": 0.33881863951683044, + "rewards/rejected": -1.6453897953033447, + "step": 11970 + }, + { + "epoch": 2.0640937284631287, + "grad_norm": 24.070341110229492, + "learning_rate": 2.6832829342287488e-08, + "logits/chosen": -2.308072090148926, + "logits/rejected": -2.2790815830230713, + "logps/chosen": -180.55453491210938, + "logps/rejected": -217.0457763671875, + "loss": 0.5715, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.24399733543396, + "rewards/margins": 0.39089125394821167, + "rewards/rejected": -1.6348886489868164, + "step": 11980 + }, + { + "epoch": 2.065816678152998, + "grad_norm": 39.581634521484375, + "learning_rate": 2.674404338795611e-08, + "logits/chosen": -2.369114637374878, + "logits/rejected": -2.329042434692383, + "logps/chosen": -181.17039489746094, + "logps/rejected": -206.80126953125, + "loss": 0.5977, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.2240442037582397, + "rewards/margins": 0.3359059989452362, + "rewards/rejected": -1.5599501132965088, + "step": 11990 + }, + { + "epoch": 2.067539627842867, + "grad_norm": 26.552762985229492, + "learning_rate": 2.665535090869827e-08, + "logits/chosen": -2.284182071685791, + "logits/rejected": -2.271190881729126, + "logps/chosen": -173.64321899414062, + "logps/rejected": -212.55203247070312, + "loss": 0.5891, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.2180447578430176, + "rewards/margins": 0.3685550391674042, + "rewards/rejected": -1.5865997076034546, + "step": 12000 + }, + { + "epoch": 2.067539627842867, + "eval_logits/chosen": -2.382530927658081, + "eval_logits/rejected": -2.37129282951355, + "eval_logps/chosen": -163.54176330566406, + "eval_logps/rejected": -183.97140502929688, + "eval_loss": 0.6490315794944763, + "eval_rewards/accuracies": 0.6045538783073425, + "eval_rewards/chosen": -1.0452629327774048, + "eval_rewards/margins": 0.16695521771907806, + "eval_rewards/rejected": -1.2122180461883545, + "eval_runtime": 384.8046, + "eval_samples_per_second": 11.185, + "eval_steps_per_second": 1.398, + "step": 12000 + }, + { + "epoch": 2.069262577532736, + "grad_norm": 42.818328857421875, + "learning_rate": 2.656675226100481e-08, + "logits/chosen": -2.3415584564208984, + "logits/rejected": -2.309140920639038, + "logps/chosen": -180.33029174804688, + "logps/rejected": -211.2962646484375, + "loss": 0.605, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.2456438541412354, + "rewards/margins": 0.3233585059642792, + "rewards/rejected": -1.569002389907837, + "step": 12010 + }, + { + "epoch": 2.070985527222605, + "grad_norm": 31.882787704467773, + "learning_rate": 2.6478247800989474e-08, + "logits/chosen": -2.3419384956359863, + "logits/rejected": -2.3203039169311523, + "logps/chosen": -169.97256469726562, + "logps/rejected": -198.49661254882812, + "loss": 0.6165, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.167227029800415, + "rewards/margins": 0.29829588532447815, + "rewards/rejected": -1.4655230045318604, + "step": 12020 + }, + { + "epoch": 2.072708476912474, + "grad_norm": 42.521568298339844, + "learning_rate": 2.63898378843874e-08, + "logits/chosen": -2.399444103240967, + "logits/rejected": -2.383762836456299, + "logps/chosen": -160.01272583007812, + "logps/rejected": -182.17120361328125, + "loss": 0.6475, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.098426103591919, + "rewards/margins": 0.21871300041675568, + "rewards/rejected": -1.3171392679214478, + "step": 12030 + }, + { + "epoch": 2.0744314266023434, + "grad_norm": 27.004297256469727, + "learning_rate": 2.6301522866553714e-08, + "logits/chosen": -2.3278212547302246, + "logits/rejected": -2.2990002632141113, + "logps/chosen": -170.15162658691406, + "logps/rejected": -200.32017517089844, + "loss": 0.5964, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.1340413093566895, + "rewards/margins": 0.35810285806655884, + "rewards/rejected": -1.4921441078186035, + "step": 12040 + }, + { + "epoch": 2.0761543762922123, + "grad_norm": 34.77950668334961, + "learning_rate": 2.621330310246208e-08, + "logits/chosen": -2.37144136428833, + "logits/rejected": -2.3289756774902344, + "logps/chosen": -166.37918090820312, + "logps/rejected": -203.81527709960938, + "loss": 0.5591, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.1094896793365479, + "rewards/margins": 0.4241317808628082, + "rewards/rejected": -1.5336215496063232, + "step": 12050 + }, + { + "epoch": 2.0778773259820813, + "grad_norm": 30.318689346313477, + "learning_rate": 2.6125178946703352e-08, + "logits/chosen": -2.334500312805176, + "logits/rejected": -2.3106274604797363, + "logps/chosen": -171.7869415283203, + "logps/rejected": -195.88204956054688, + "loss": 0.6165, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.151228904724121, + "rewards/margins": 0.27953606843948364, + "rewards/rejected": -1.43076491355896, + "step": 12060 + }, + { + "epoch": 2.0796002756719503, + "grad_norm": 26.932645797729492, + "learning_rate": 2.6037150753484082e-08, + "logits/chosen": -2.3272311687469482, + "logits/rejected": -2.286487102508545, + "logps/chosen": -170.1809539794922, + "logps/rejected": -203.5109100341797, + "loss": 0.5733, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.1393085718154907, + "rewards/margins": 0.4047151505947113, + "rewards/rejected": -1.5440236330032349, + "step": 12070 + }, + { + "epoch": 2.0813232253618192, + "grad_norm": 33.24617385864258, + "learning_rate": 2.594921887662509e-08, + "logits/chosen": -2.2792840003967285, + "logits/rejected": -2.2516608238220215, + "logps/chosen": -165.46157836914062, + "logps/rejected": -200.6604461669922, + "loss": 0.5774, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.0833677053451538, + "rewards/margins": 0.38120564818382263, + "rewards/rejected": -1.4645735025405884, + "step": 12080 + }, + { + "epoch": 2.0830461750516887, + "grad_norm": 34.20355987548828, + "learning_rate": 2.5861383669560045e-08, + "logits/chosen": -2.314237594604492, + "logits/rejected": -2.2838430404663086, + "logps/chosen": -178.33566284179688, + "logps/rejected": -203.56239318847656, + "loss": 0.613, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.2032502889633179, + "rewards/margins": 0.3110768496990204, + "rewards/rejected": -1.514327049255371, + "step": 12090 + }, + { + "epoch": 2.0847691247415576, + "grad_norm": 30.14881134033203, + "learning_rate": 2.5773645485334122e-08, + "logits/chosen": -2.303605318069458, + "logits/rejected": -2.282360076904297, + "logps/chosen": -169.51351928710938, + "logps/rejected": -194.35842895507812, + "loss": 0.5954, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1090033054351807, + "rewards/margins": 0.3162084221839905, + "rewards/rejected": -1.4252115488052368, + "step": 12100 + }, + { + "epoch": 2.0864920744314266, + "grad_norm": 31.299373626708984, + "learning_rate": 2.568600467660245e-08, + "logits/chosen": -2.326810836791992, + "logits/rejected": -2.2981925010681152, + "logps/chosen": -177.06642150878906, + "logps/rejected": -201.6583709716797, + "loss": 0.6135, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.1888201236724854, + "rewards/margins": 0.29908499121665955, + "rewards/rejected": -1.4879052639007568, + "step": 12110 + }, + { + "epoch": 2.0882150241212956, + "grad_norm": 37.546592712402344, + "learning_rate": 2.5598461595628827e-08, + "logits/chosen": -2.306678295135498, + "logits/rejected": -2.2846360206604004, + "logps/chosen": -178.78823852539062, + "logps/rejected": -204.18438720703125, + "loss": 0.6238, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.2205679416656494, + "rewards/margins": 0.3085453510284424, + "rewards/rejected": -1.5291130542755127, + "step": 12120 + }, + { + "epoch": 2.0899379738111645, + "grad_norm": 29.08656883239746, + "learning_rate": 2.5511016594284236e-08, + "logits/chosen": -2.310135841369629, + "logits/rejected": -2.276185989379883, + "logps/chosen": -166.3862762451172, + "logps/rejected": -207.1776123046875, + "loss": 0.5526, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.1164814233779907, + "rewards/margins": 0.4281376004219055, + "rewards/rejected": -1.5446192026138306, + "step": 12130 + }, + { + "epoch": 2.091660923501034, + "grad_norm": 36.57272720336914, + "learning_rate": 2.5423670024045397e-08, + "logits/chosen": -2.3395144939422607, + "logits/rejected": -2.320728302001953, + "logps/chosen": -169.90667724609375, + "logps/rejected": -198.46682739257812, + "loss": 0.6227, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.1688696146011353, + "rewards/margins": 0.2796100974082947, + "rewards/rejected": -1.4484796524047852, + "step": 12140 + }, + { + "epoch": 2.093383873190903, + "grad_norm": 58.29833984375, + "learning_rate": 2.5336422235993403e-08, + "logits/chosen": -2.3326809406280518, + "logits/rejected": -2.3147945404052734, + "logps/chosen": -169.92568969726562, + "logps/rejected": -201.376708984375, + "loss": 0.6059, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.1544063091278076, + "rewards/margins": 0.33958542346954346, + "rewards/rejected": -1.4939919710159302, + "step": 12150 + }, + { + "epoch": 2.095106822880772, + "grad_norm": 60.696189880371094, + "learning_rate": 2.5249273580812346e-08, + "logits/chosen": -2.2743961811065674, + "logits/rejected": -2.2431156635284424, + "logps/chosen": -170.11212158203125, + "logps/rejected": -200.20468139648438, + "loss": 0.5955, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.1341893672943115, + "rewards/margins": 0.3392500877380371, + "rewards/rejected": -1.4734394550323486, + "step": 12160 + }, + { + "epoch": 2.096829772570641, + "grad_norm": 35.26255798339844, + "learning_rate": 2.5162224408787874e-08, + "logits/chosen": -2.2966880798339844, + "logits/rejected": -2.274604320526123, + "logps/chosen": -179.924560546875, + "logps/rejected": -213.06930541992188, + "loss": 0.6118, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.2429232597351074, + "rewards/margins": 0.3358061611652374, + "rewards/rejected": -1.5787293910980225, + "step": 12170 + }, + { + "epoch": 2.09855272226051, + "grad_norm": 27.878921508789062, + "learning_rate": 2.5075275069805646e-08, + "logits/chosen": -2.232279062271118, + "logits/rejected": -2.2205493450164795, + "logps/chosen": -175.306884765625, + "logps/rejected": -222.40975952148438, + "loss": 0.5734, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.2602647542953491, + "rewards/margins": 0.3941713571548462, + "rewards/rejected": -1.6544361114501953, + "step": 12180 + }, + { + "epoch": 2.1002756719503792, + "grad_norm": 34.53485870361328, + "learning_rate": 2.4988425913350192e-08, + "logits/chosen": -2.2715885639190674, + "logits/rejected": -2.239790439605713, + "logps/chosen": -181.70046997070312, + "logps/rejected": -204.890869140625, + "loss": 0.6273, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.2391859292984009, + "rewards/margins": 0.28382736444473267, + "rewards/rejected": -1.5230133533477783, + "step": 12190 + }, + { + "epoch": 2.101998621640248, + "grad_norm": 38.71670913696289, + "learning_rate": 2.4901677288503326e-08, + "logits/chosen": -2.347888469696045, + "logits/rejected": -2.324063777923584, + "logps/chosen": -184.92343139648438, + "logps/rejected": -215.3417510986328, + "loss": 0.596, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.2747955322265625, + "rewards/margins": 0.3419981598854065, + "rewards/rejected": -1.6167936325073242, + "step": 12200 + }, + { + "epoch": 2.103721571330117, + "grad_norm": 30.0412540435791, + "learning_rate": 2.4815029543942735e-08, + "logits/chosen": -2.303335189819336, + "logits/rejected": -2.2876079082489014, + "logps/chosen": -183.39291381835938, + "logps/rejected": -210.96267700195312, + "loss": 0.6119, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.284551978111267, + "rewards/margins": 0.3175322413444519, + "rewards/rejected": -1.6020843982696533, + "step": 12210 + }, + { + "epoch": 2.105444521019986, + "grad_norm": 41.832977294921875, + "learning_rate": 2.4728483027940715e-08, + "logits/chosen": -2.237399101257324, + "logits/rejected": -2.2207062244415283, + "logps/chosen": -185.9912109375, + "logps/rejected": -210.3903045654297, + "loss": 0.6351, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.323387622833252, + "rewards/margins": 0.25132933259010315, + "rewards/rejected": -1.5747170448303223, + "step": 12220 + }, + { + "epoch": 2.107167470709855, + "grad_norm": 28.826099395751953, + "learning_rate": 2.4642038088362595e-08, + "logits/chosen": -2.3004941940307617, + "logits/rejected": -2.2839975357055664, + "logps/chosen": -180.2684326171875, + "logps/rejected": -212.25131225585938, + "loss": 0.6004, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.275651216506958, + "rewards/margins": 0.3466276526451111, + "rewards/rejected": -1.6222788095474243, + "step": 12230 + }, + { + "epoch": 2.1088904203997245, + "grad_norm": 34.61656951904297, + "learning_rate": 2.4555695072665494e-08, + "logits/chosen": -2.231196403503418, + "logits/rejected": -2.204634189605713, + "logps/chosen": -178.1566619873047, + "logps/rejected": -210.77017211914062, + "loss": 0.5946, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.2443420886993408, + "rewards/margins": 0.34749799966812134, + "rewards/rejected": -1.5918400287628174, + "step": 12240 + }, + { + "epoch": 2.1106133700895935, + "grad_norm": 41.588321685791016, + "learning_rate": 2.446945432789681e-08, + "logits/chosen": -2.287522315979004, + "logits/rejected": -2.264913558959961, + "logps/chosen": -176.79226684570312, + "logps/rejected": -197.31668090820312, + "loss": 0.6499, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.2354786396026611, + "rewards/margins": 0.23657634854316711, + "rewards/rejected": -1.4720548391342163, + "step": 12250 + }, + { + "epoch": 2.1123363197794625, + "grad_norm": 37.527713775634766, + "learning_rate": 2.4383316200692928e-08, + "logits/chosen": -2.2929043769836426, + "logits/rejected": -2.2686684131622314, + "logps/chosen": -169.481689453125, + "logps/rejected": -199.8762969970703, + "loss": 0.6155, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.1590509414672852, + "rewards/margins": 0.29075926542282104, + "rewards/rejected": -1.449810266494751, + "step": 12260 + }, + { + "epoch": 2.1140592694693314, + "grad_norm": 31.186466217041016, + "learning_rate": 2.4297281037277694e-08, + "logits/chosen": -2.366913318634033, + "logits/rejected": -2.339359760284424, + "logps/chosen": -176.31210327148438, + "logps/rejected": -207.9870147705078, + "loss": 0.6041, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.203351378440857, + "rewards/margins": 0.34722915291786194, + "rewards/rejected": -1.550580382347107, + "step": 12270 + }, + { + "epoch": 2.1157822191592004, + "grad_norm": 27.913915634155273, + "learning_rate": 2.4211349183461195e-08, + "logits/chosen": -2.3033981323242188, + "logits/rejected": -2.2841343879699707, + "logps/chosen": -177.34950256347656, + "logps/rejected": -204.71127319335938, + "loss": 0.6015, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.1983354091644287, + "rewards/margins": 0.3280332684516907, + "rewards/rejected": -1.5263686180114746, + "step": 12280 + }, + { + "epoch": 2.11750516884907, + "grad_norm": 34.77854537963867, + "learning_rate": 2.4125520984638177e-08, + "logits/chosen": -2.2566428184509277, + "logits/rejected": -2.228959083557129, + "logps/chosen": -173.9934539794922, + "logps/rejected": -200.5246124267578, + "loss": 0.6157, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.1927902698516846, + "rewards/margins": 0.2847309112548828, + "rewards/rejected": -1.4775211811065674, + "step": 12290 + }, + { + "epoch": 2.1192281185389388, + "grad_norm": 34.15695571899414, + "learning_rate": 2.4039796785786827e-08, + "logits/chosen": -2.311659574508667, + "logits/rejected": -2.281036853790283, + "logps/chosen": -180.02626037597656, + "logps/rejected": -207.08236694335938, + "loss": 0.6036, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.2438709735870361, + "rewards/margins": 0.30703234672546387, + "rewards/rejected": -1.5509033203125, + "step": 12300 + }, + { + "epoch": 2.1209510682288077, + "grad_norm": 25.6216983795166, + "learning_rate": 2.3954176931467323e-08, + "logits/chosen": -2.286026954650879, + "logits/rejected": -2.2513835430145264, + "logps/chosen": -173.94776916503906, + "logps/rejected": -204.9058380126953, + "loss": 0.5921, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.1738064289093018, + "rewards/margins": 0.35794585943222046, + "rewards/rejected": -1.531752347946167, + "step": 12310 + }, + { + "epoch": 2.1226740179186767, + "grad_norm": 37.73752212524414, + "learning_rate": 2.3868661765820346e-08, + "logits/chosen": -2.2825002670288086, + "logits/rejected": -2.252878189086914, + "logps/chosen": -170.19056701660156, + "logps/rejected": -213.87472534179688, + "loss": 0.5703, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.144214391708374, + "rewards/margins": 0.44874677062034607, + "rewards/rejected": -1.592961311340332, + "step": 12320 + }, + { + "epoch": 2.1243969676085457, + "grad_norm": 26.66788101196289, + "learning_rate": 2.3783251632565875e-08, + "logits/chosen": -2.3073744773864746, + "logits/rejected": -2.2946484088897705, + "logps/chosen": -176.7238006591797, + "logps/rejected": -203.3588409423828, + "loss": 0.6013, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.1875582933425903, + "rewards/margins": 0.3035629093647003, + "rewards/rejected": -1.4911211729049683, + "step": 12330 + }, + { + "epoch": 2.126119917298415, + "grad_norm": 32.90800476074219, + "learning_rate": 2.3697946875001725e-08, + "logits/chosen": -2.3109140396118164, + "logits/rejected": -2.280402898788452, + "logps/chosen": -181.58554077148438, + "logps/rejected": -217.5074005126953, + "loss": 0.5762, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.265123963356018, + "rewards/margins": 0.37893742322921753, + "rewards/rejected": -1.6440613269805908, + "step": 12340 + }, + { + "epoch": 2.127842866988284, + "grad_norm": 30.612865447998047, + "learning_rate": 2.3612747836002116e-08, + "logits/chosen": -2.231553316116333, + "logits/rejected": -2.1880717277526855, + "logps/chosen": -187.61593627929688, + "logps/rejected": -221.2359619140625, + "loss": 0.58, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.3059110641479492, + "rewards/margins": 0.400498628616333, + "rewards/rejected": -1.7064098119735718, + "step": 12350 + }, + { + "epoch": 2.129565816678153, + "grad_norm": 33.540550231933594, + "learning_rate": 2.352765485801635e-08, + "logits/chosen": -2.3030381202697754, + "logits/rejected": -2.283235549926758, + "logps/chosen": -172.56826782226562, + "logps/rejected": -210.2171630859375, + "loss": 0.5728, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1839771270751953, + "rewards/margins": 0.40479379892349243, + "rewards/rejected": -1.5887707471847534, + "step": 12360 + }, + { + "epoch": 2.131288766368022, + "grad_norm": 32.70058059692383, + "learning_rate": 2.3442668283067453e-08, + "logits/chosen": -2.292982578277588, + "logits/rejected": -2.2578206062316895, + "logps/chosen": -180.56468200683594, + "logps/rejected": -218.25924682617188, + "loss": 0.5707, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.2692445516586304, + "rewards/margins": 0.4212293028831482, + "rewards/rejected": -1.6904739141464233, + "step": 12370 + }, + { + "epoch": 2.133011716057891, + "grad_norm": 54.63309097290039, + "learning_rate": 2.335778845275079e-08, + "logits/chosen": -2.2785980701446533, + "logits/rejected": -2.274125576019287, + "logps/chosen": -180.288818359375, + "logps/rejected": -222.59780883789062, + "loss": 0.5769, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.2888532876968384, + "rewards/margins": 0.3815760314464569, + "rewards/rejected": -1.6704292297363281, + "step": 12380 + }, + { + "epoch": 2.13473466574776, + "grad_norm": 38.83860397338867, + "learning_rate": 2.32730157082326e-08, + "logits/chosen": -2.3655247688293457, + "logits/rejected": -2.349914073944092, + "logps/chosen": -183.9013214111328, + "logps/rejected": -226.04238891601562, + "loss": 0.5715, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.296132206916809, + "rewards/margins": 0.42857784032821655, + "rewards/rejected": -1.7247101068496704, + "step": 12390 + }, + { + "epoch": 2.1364576154376294, + "grad_norm": 28.777034759521484, + "learning_rate": 2.3188350390248796e-08, + "logits/chosen": -2.28861665725708, + "logits/rejected": -2.2760109901428223, + "logps/chosen": -195.12936401367188, + "logps/rejected": -231.7972869873047, + "loss": 0.5808, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.390777826309204, + "rewards/margins": 0.3694989085197449, + "rewards/rejected": -1.7602765560150146, + "step": 12400 + }, + { + "epoch": 2.1364576154376294, + "eval_logits/chosen": -2.350759744644165, + "eval_logits/rejected": -2.3382339477539062, + "eval_logps/chosen": -178.0778350830078, + "eval_logps/rejected": -199.925537109375, + "eval_loss": 0.6489568948745728, + "eval_rewards/accuracies": 0.6038568615913391, + "eval_rewards/chosen": -1.1906236410140991, + "eval_rewards/margins": 0.1811356097459793, + "eval_rewards/rejected": -1.3717591762542725, + "eval_runtime": 384.8916, + "eval_samples_per_second": 11.182, + "eval_steps_per_second": 1.398, + "step": 12400 + }, + { + "epoch": 2.1381805651274983, + "grad_norm": 38.66655349731445, + "learning_rate": 2.310379283910343e-08, + "logits/chosen": -2.267749309539795, + "logits/rejected": -2.2331278324127197, + "logps/chosen": -186.0257568359375, + "logps/rejected": -220.5785675048828, + "loss": 0.5827, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.3094804286956787, + "rewards/margins": 0.37783220410346985, + "rewards/rejected": -1.6873127222061157, + "step": 12410 + }, + { + "epoch": 2.1399035148173673, + "grad_norm": 56.3077392578125, + "learning_rate": 2.30193433946674e-08, + "logits/chosen": -2.2084813117980957, + "logits/rejected": -2.182887554168701, + "logps/chosen": -195.47129821777344, + "logps/rejected": -227.36257934570312, + "loss": 0.6132, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.3920471668243408, + "rewards/margins": 0.3612838089466095, + "rewards/rejected": -1.753330945968628, + "step": 12420 + }, + { + "epoch": 2.1416264645072363, + "grad_norm": 29.597278594970703, + "learning_rate": 2.2935002396377128e-08, + "logits/chosen": -2.2616899013519287, + "logits/rejected": -2.2430830001831055, + "logps/chosen": -196.7481231689453, + "logps/rejected": -226.3641815185547, + "loss": 0.6206, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.4068971872329712, + "rewards/margins": 0.32016709446907043, + "rewards/rejected": -1.7270641326904297, + "step": 12430 + }, + { + "epoch": 2.1433494141971057, + "grad_norm": 33.88653564453125, + "learning_rate": 2.2850770183233125e-08, + "logits/chosen": -2.235628843307495, + "logits/rejected": -2.2164924144744873, + "logps/chosen": -185.83493041992188, + "logps/rejected": -216.488037109375, + "loss": 0.605, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.3134291172027588, + "rewards/margins": 0.33438539505004883, + "rewards/rejected": -1.6478145122528076, + "step": 12440 + }, + { + "epoch": 2.1450723638869746, + "grad_norm": 36.81434631347656, + "learning_rate": 2.276664709379863e-08, + "logits/chosen": -2.273482084274292, + "logits/rejected": -2.2568745613098145, + "logps/chosen": -182.1161651611328, + "logps/rejected": -213.73391723632812, + "loss": 0.6151, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.305662989616394, + "rewards/margins": 0.3310778737068176, + "rewards/rejected": -1.6367409229278564, + "step": 12450 + }, + { + "epoch": 2.1467953135768436, + "grad_norm": 39.7001953125, + "learning_rate": 2.2682633466198263e-08, + "logits/chosen": -2.3082103729248047, + "logits/rejected": -2.2860910892486572, + "logps/chosen": -190.76364135742188, + "logps/rejected": -221.93704223632812, + "loss": 0.5958, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.3485095500946045, + "rewards/margins": 0.35299503803253174, + "rewards/rejected": -1.7015047073364258, + "step": 12460 + }, + { + "epoch": 2.1485182632667126, + "grad_norm": 44.36844253540039, + "learning_rate": 2.259872963811672e-08, + "logits/chosen": -2.3637163639068604, + "logits/rejected": -2.3345274925231934, + "logps/chosen": -190.12307739257812, + "logps/rejected": -237.3924560546875, + "loss": 0.5544, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.3406221866607666, + "rewards/margins": 0.48035627603530884, + "rewards/rejected": -1.8209785223007202, + "step": 12470 + }, + { + "epoch": 2.1502412129565815, + "grad_norm": 32.28123092651367, + "learning_rate": 2.2514935946797347e-08, + "logits/chosen": -2.3527333736419678, + "logits/rejected": -2.334380626678467, + "logps/chosen": -186.5384521484375, + "logps/rejected": -218.5380859375, + "loss": 0.5923, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.308751106262207, + "rewards/margins": 0.33788079023361206, + "rewards/rejected": -1.6466318368911743, + "step": 12480 + }, + { + "epoch": 2.1519641626464505, + "grad_norm": 45.380977630615234, + "learning_rate": 2.2431252729040796e-08, + "logits/chosen": -2.248897075653076, + "logits/rejected": -2.225717067718506, + "logps/chosen": -190.13870239257812, + "logps/rejected": -228.26266479492188, + "loss": 0.5985, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.3422235250473022, + "rewards/margins": 0.4081028997898102, + "rewards/rejected": -1.7503265142440796, + "step": 12490 + }, + { + "epoch": 2.15368711233632, + "grad_norm": 44.201690673828125, + "learning_rate": 2.2347680321203655e-08, + "logits/chosen": -2.2898340225219727, + "logits/rejected": -2.266667366027832, + "logps/chosen": -193.89730834960938, + "logps/rejected": -227.7718963623047, + "loss": 0.6083, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.3940218687057495, + "rewards/margins": 0.3586365580558777, + "rewards/rejected": -1.7526586055755615, + "step": 12500 + }, + { + "epoch": 2.155410062026189, + "grad_norm": 30.75022315979004, + "learning_rate": 2.2264219059197174e-08, + "logits/chosen": -2.3246655464172363, + "logits/rejected": -2.3050475120544434, + "logps/chosen": -183.81161499023438, + "logps/rejected": -209.09164428710938, + "loss": 0.6141, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.2971190214157104, + "rewards/margins": 0.28229445219039917, + "rewards/rejected": -1.5794134140014648, + "step": 12510 + }, + { + "epoch": 2.157133011716058, + "grad_norm": 37.722503662109375, + "learning_rate": 2.218086927848587e-08, + "logits/chosen": -2.2650203704833984, + "logits/rejected": -2.2433574199676514, + "logps/chosen": -184.23312377929688, + "logps/rejected": -210.4525909423828, + "loss": 0.6266, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.3043887615203857, + "rewards/margins": 0.26472169160842896, + "rewards/rejected": -1.5691105127334595, + "step": 12520 + }, + { + "epoch": 2.158855961405927, + "grad_norm": 34.342227935791016, + "learning_rate": 2.2097631314086112e-08, + "logits/chosen": -2.2892611026763916, + "logits/rejected": -2.2736310958862305, + "logps/chosen": -183.8441925048828, + "logps/rejected": -217.6746826171875, + "loss": 0.594, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.287831425666809, + "rewards/margins": 0.3383808732032776, + "rewards/rejected": -1.6262121200561523, + "step": 12530 + }, + { + "epoch": 2.160578911095796, + "grad_norm": 42.98286056518555, + "learning_rate": 2.201450550056486e-08, + "logits/chosen": -2.2905638217926025, + "logits/rejected": -2.264129161834717, + "logps/chosen": -180.79383850097656, + "logps/rejected": -211.2161865234375, + "loss": 0.6188, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.2625057697296143, + "rewards/margins": 0.31720057129859924, + "rewards/rejected": -1.5797064304351807, + "step": 12540 + }, + { + "epoch": 2.162301860785665, + "grad_norm": 52.744972229003906, + "learning_rate": 2.193149217203833e-08, + "logits/chosen": -2.370565891265869, + "logits/rejected": -2.3550562858581543, + "logps/chosen": -172.3019561767578, + "logps/rejected": -199.21258544921875, + "loss": 0.6206, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.1986303329467773, + "rewards/margins": 0.2771567702293396, + "rewards/rejected": -1.4757869243621826, + "step": 12550 + }, + { + "epoch": 2.164024810475534, + "grad_norm": 31.21704864501953, + "learning_rate": 2.1848591662170546e-08, + "logits/chosen": -2.317023515701294, + "logits/rejected": -2.283731460571289, + "logps/chosen": -181.77279663085938, + "logps/rejected": -203.51612854003906, + "loss": 0.6246, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.2345434427261353, + "rewards/margins": 0.3114972710609436, + "rewards/rejected": -1.5460407733917236, + "step": 12560 + }, + { + "epoch": 2.165747760165403, + "grad_norm": 38.83090591430664, + "learning_rate": 2.1765804304172137e-08, + "logits/chosen": -2.290691375732422, + "logits/rejected": -2.2567405700683594, + "logps/chosen": -168.33099365234375, + "logps/rejected": -199.4054412841797, + "loss": 0.5832, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.1442289352416992, + "rewards/margins": 0.34920820593833923, + "rewards/rejected": -1.4934370517730713, + "step": 12570 + }, + { + "epoch": 2.167470709855272, + "grad_norm": 33.23080825805664, + "learning_rate": 2.1683130430798907e-08, + "logits/chosen": -2.3136255741119385, + "logits/rejected": -2.272890567779541, + "logps/chosen": -178.71817016601562, + "logps/rejected": -206.2785186767578, + "loss": 0.5737, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1674859523773193, + "rewards/margins": 0.3780251145362854, + "rewards/rejected": -1.54551100730896, + "step": 12580 + }, + { + "epoch": 2.169193659545141, + "grad_norm": 37.45336151123047, + "learning_rate": 2.16005703743505e-08, + "logits/chosen": -2.2911033630371094, + "logits/rejected": -2.2649600505828857, + "logps/chosen": -172.32638549804688, + "logps/rejected": -202.3101043701172, + "loss": 0.5868, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.1516131162643433, + "rewards/margins": 0.34974223375320435, + "rewards/rejected": -1.5013554096221924, + "step": 12590 + }, + { + "epoch": 2.1709166092350105, + "grad_norm": 31.608232498168945, + "learning_rate": 2.151812446666908e-08, + "logits/chosen": -2.2943625450134277, + "logits/rejected": -2.2727608680725098, + "logps/chosen": -184.67996215820312, + "logps/rejected": -205.7906951904297, + "loss": 0.6199, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.2513759136199951, + "rewards/margins": 0.284992516040802, + "rewards/rejected": -1.5363683700561523, + "step": 12600 + }, + { + "epoch": 2.1726395589248795, + "grad_norm": 26.701438903808594, + "learning_rate": 2.1435793039138035e-08, + "logits/chosen": -2.3679873943328857, + "logits/rejected": -2.358001232147217, + "logps/chosen": -173.2816619873047, + "logps/rejected": -207.9984588623047, + "loss": 0.5833, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1841771602630615, + "rewards/margins": 0.359028160572052, + "rewards/rejected": -1.5432054996490479, + "step": 12610 + }, + { + "epoch": 2.1743625086147484, + "grad_norm": 42.2379035949707, + "learning_rate": 2.135357642268062e-08, + "logits/chosen": -2.402472734451294, + "logits/rejected": -2.3818836212158203, + "logps/chosen": -188.36465454101562, + "logps/rejected": -212.6293182373047, + "loss": 0.6238, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.3277316093444824, + "rewards/margins": 0.27842921018600464, + "rewards/rejected": -1.6061607599258423, + "step": 12620 + }, + { + "epoch": 2.1760854583046174, + "grad_norm": 35.1007194519043, + "learning_rate": 2.1271474947758533e-08, + "logits/chosen": -2.351163864135742, + "logits/rejected": -2.3398239612579346, + "logps/chosen": -172.6517333984375, + "logps/rejected": -201.7603759765625, + "loss": 0.6291, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.2183504104614258, + "rewards/margins": 0.28705310821533203, + "rewards/rejected": -1.5054035186767578, + "step": 12630 + }, + { + "epoch": 2.1778084079944864, + "grad_norm": 38.27680587768555, + "learning_rate": 2.1189488944370753e-08, + "logits/chosen": -2.348966598510742, + "logits/rejected": -2.323958158493042, + "logps/chosen": -168.53770446777344, + "logps/rejected": -202.28419494628906, + "loss": 0.5809, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.131299614906311, + "rewards/margins": 0.3630850613117218, + "rewards/rejected": -1.4943846464157104, + "step": 12640 + }, + { + "epoch": 2.179531357684356, + "grad_norm": 50.590946197509766, + "learning_rate": 2.110761874205214e-08, + "logits/chosen": -2.30483078956604, + "logits/rejected": -2.2819137573242188, + "logps/chosen": -166.08322143554688, + "logps/rejected": -191.8925018310547, + "loss": 0.6195, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.1224647760391235, + "rewards/margins": 0.28845423460006714, + "rewards/rejected": -1.4109190702438354, + "step": 12650 + }, + { + "epoch": 2.1812543073742248, + "grad_norm": 34.77700424194336, + "learning_rate": 2.1025864669872028e-08, + "logits/chosen": -2.2371766567230225, + "logits/rejected": -2.2177200317382812, + "logps/chosen": -180.53785705566406, + "logps/rejected": -207.4517822265625, + "loss": 0.6437, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.2753950357437134, + "rewards/margins": 0.27624499797821045, + "rewards/rejected": -1.5516400337219238, + "step": 12660 + }, + { + "epoch": 2.1829772570640937, + "grad_norm": 34.8372688293457, + "learning_rate": 2.0944227056433062e-08, + "logits/chosen": -2.4396395683288574, + "logits/rejected": -2.395169496536255, + "logps/chosen": -169.12814331054688, + "logps/rejected": -204.72259521484375, + "loss": 0.5752, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.1491328477859497, + "rewards/margins": 0.3878241777420044, + "rewards/rejected": -1.5369569063186646, + "step": 12670 + }, + { + "epoch": 2.1847002067539627, + "grad_norm": 30.4854679107666, + "learning_rate": 2.0862706229869716e-08, + "logits/chosen": -2.276838541030884, + "logits/rejected": -2.249408006668091, + "logps/chosen": -174.7004852294922, + "logps/rejected": -213.88442993164062, + "loss": 0.5698, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.2216650247573853, + "rewards/margins": 0.407600075006485, + "rewards/rejected": -1.6292650699615479, + "step": 12680 + }, + { + "epoch": 2.1864231564438317, + "grad_norm": 33.259063720703125, + "learning_rate": 2.0781302517847115e-08, + "logits/chosen": -2.2541377544403076, + "logits/rejected": -2.2349231243133545, + "logps/chosen": -174.60008239746094, + "logps/rejected": -207.94424438476562, + "loss": 0.6085, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.2174088954925537, + "rewards/margins": 0.3410707116127014, + "rewards/rejected": -1.5584796667099, + "step": 12690 + }, + { + "epoch": 2.188146106133701, + "grad_norm": 30.73868179321289, + "learning_rate": 2.0700016247559592e-08, + "logits/chosen": -2.2638649940490723, + "logits/rejected": -2.23850679397583, + "logps/chosen": -183.5129852294922, + "logps/rejected": -213.8329620361328, + "loss": 0.5905, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.2510440349578857, + "rewards/margins": 0.35103440284729004, + "rewards/rejected": -1.6020784378051758, + "step": 12700 + }, + { + "epoch": 2.18986905582357, + "grad_norm": 32.50075149536133, + "learning_rate": 2.0618847745729506e-08, + "logits/chosen": -2.3324215412139893, + "logits/rejected": -2.3214612007141113, + "logps/chosen": -179.8836669921875, + "logps/rejected": -221.8270263671875, + "loss": 0.5777, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.2533272504806519, + "rewards/margins": 0.3927740454673767, + "rewards/rejected": -1.6461012363433838, + "step": 12710 + }, + { + "epoch": 2.191592005513439, + "grad_norm": 30.249420166015625, + "learning_rate": 2.05377973386058e-08, + "logits/chosen": -2.3358154296875, + "logits/rejected": -2.295535087585449, + "logps/chosen": -180.45872497558594, + "logps/rejected": -215.7207489013672, + "loss": 0.5581, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.2281229496002197, + "rewards/margins": 0.424754798412323, + "rewards/rejected": -1.6528778076171875, + "step": 12720 + }, + { + "epoch": 2.193314955203308, + "grad_norm": 38.64509963989258, + "learning_rate": 2.0456865351962742e-08, + "logits/chosen": -2.259183883666992, + "logits/rejected": -2.230043411254883, + "logps/chosen": -179.16160583496094, + "logps/rejected": -213.3286895751953, + "loss": 0.5972, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.2795768976211548, + "rewards/margins": 0.3571908175945282, + "rewards/rejected": -1.6367677450180054, + "step": 12730 + }, + { + "epoch": 2.195037904893177, + "grad_norm": 34.17341995239258, + "learning_rate": 2.037605211109866e-08, + "logits/chosen": -2.2671656608581543, + "logits/rejected": -2.246932029724121, + "logps/chosen": -193.5773468017578, + "logps/rejected": -230.35049438476562, + "loss": 0.5922, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.408772349357605, + "rewards/margins": 0.38163310289382935, + "rewards/rejected": -1.790405511856079, + "step": 12740 + }, + { + "epoch": 2.1967608545830464, + "grad_norm": 30.60337257385254, + "learning_rate": 2.0295357940834605e-08, + "logits/chosen": -2.26202392578125, + "logits/rejected": -2.232573986053467, + "logps/chosen": -182.06118774414062, + "logps/rejected": -218.7804412841797, + "loss": 0.5819, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.2679184675216675, + "rewards/margins": 0.38135385513305664, + "rewards/rejected": -1.6492723226547241, + "step": 12750 + }, + { + "epoch": 2.1984838042729153, + "grad_norm": 42.23899841308594, + "learning_rate": 2.0214783165512984e-08, + "logits/chosen": -2.2580082416534424, + "logits/rejected": -2.240325450897217, + "logps/chosen": -186.8460693359375, + "logps/rejected": -219.8138427734375, + "loss": 0.6256, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.3406201601028442, + "rewards/margins": 0.3337910771369934, + "rewards/rejected": -1.6744110584259033, + "step": 12760 + }, + { + "epoch": 2.2002067539627843, + "grad_norm": 40.34876251220703, + "learning_rate": 2.0134328108996308e-08, + "logits/chosen": -2.3470633029937744, + "logits/rejected": -2.312948226928711, + "logps/chosen": -186.7422637939453, + "logps/rejected": -210.3963623046875, + "loss": 0.6185, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.2813963890075684, + "rewards/margins": 0.30719074606895447, + "rewards/rejected": -1.5885872840881348, + "step": 12770 + }, + { + "epoch": 2.2019297036526533, + "grad_norm": 30.002656936645508, + "learning_rate": 2.0053993094665937e-08, + "logits/chosen": -2.3167033195495605, + "logits/rejected": -2.2913691997528076, + "logps/chosen": -187.4027557373047, + "logps/rejected": -219.14956665039062, + "loss": 0.6183, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.3514249324798584, + "rewards/margins": 0.31111449003219604, + "rewards/rejected": -1.6625392436981201, + "step": 12780 + }, + { + "epoch": 2.2036526533425222, + "grad_norm": 37.465084075927734, + "learning_rate": 1.9973778445420732e-08, + "logits/chosen": -2.2503247261047363, + "logits/rejected": -2.2288479804992676, + "logps/chosen": -192.74766540527344, + "logps/rejected": -226.4638214111328, + "loss": 0.5781, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.3739817142486572, + "rewards/margins": 0.36986181139945984, + "rewards/rejected": -1.74384343624115, + "step": 12790 + }, + { + "epoch": 2.205375603032391, + "grad_norm": 40.986148834228516, + "learning_rate": 1.9893684483675706e-08, + "logits/chosen": -2.309469223022461, + "logits/rejected": -2.2865958213806152, + "logps/chosen": -182.12448120117188, + "logps/rejected": -213.16604614257812, + "loss": 0.6051, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.2606197595596313, + "rewards/margins": 0.3188409209251404, + "rewards/rejected": -1.5794607400894165, + "step": 12800 + }, + { + "epoch": 2.205375603032391, + "eval_logits/chosen": -2.365828037261963, + "eval_logits/rejected": -2.354224681854248, + "eval_logps/chosen": -168.60400390625, + "eval_logps/rejected": -189.23008728027344, + "eval_loss": 0.6495684385299683, + "eval_rewards/accuracies": 0.6052509546279907, + "eval_rewards/chosen": -1.0958852767944336, + "eval_rewards/margins": 0.16891968250274658, + "eval_rewards/rejected": -1.2648048400878906, + "eval_runtime": 384.7328, + "eval_samples_per_second": 11.187, + "eval_steps_per_second": 1.398, + "step": 12800 + }, + { + "epoch": 2.2070985527222606, + "grad_norm": 39.08464431762695, + "learning_rate": 1.98137115313608e-08, + "logits/chosen": -2.3342087268829346, + "logits/rejected": -2.3211050033569336, + "logps/chosen": -181.80323791503906, + "logps/rejected": -210.1888427734375, + "loss": 0.6366, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.306378960609436, + "rewards/margins": 0.2542952299118042, + "rewards/rejected": -1.5606739521026611, + "step": 12810 + }, + { + "epoch": 2.2088215024121296, + "grad_norm": 48.59690856933594, + "learning_rate": 1.9733859909919593e-08, + "logits/chosen": -2.241438627243042, + "logits/rejected": -2.230583906173706, + "logps/chosen": -178.1573486328125, + "logps/rejected": -205.2084503173828, + "loss": 0.6242, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.2335833311080933, + "rewards/margins": 0.2759568989276886, + "rewards/rejected": -1.509540319442749, + "step": 12820 + }, + { + "epoch": 2.2105444521019986, + "grad_norm": 34.11722946166992, + "learning_rate": 1.9654129940307994e-08, + "logits/chosen": -2.2492871284484863, + "logits/rejected": -2.230903148651123, + "logps/chosen": -181.12081909179688, + "logps/rejected": -205.6280059814453, + "loss": 0.6323, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.297506332397461, + "rewards/margins": 0.24382254481315613, + "rewards/rejected": -1.54132878780365, + "step": 12830 + }, + { + "epoch": 2.2122674017918675, + "grad_norm": 35.33450698852539, + "learning_rate": 1.9574521942992884e-08, + "logits/chosen": -2.348066806793213, + "logits/rejected": -2.331679582595825, + "logps/chosen": -180.0199432373047, + "logps/rejected": -214.4061279296875, + "loss": 0.5888, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.2399193048477173, + "rewards/margins": 0.3646962642669678, + "rewards/rejected": -1.604615569114685, + "step": 12840 + }, + { + "epoch": 2.213990351481737, + "grad_norm": 30.569448471069336, + "learning_rate": 1.9495036237950956e-08, + "logits/chosen": -2.342040538787842, + "logits/rejected": -2.307283401489258, + "logps/chosen": -178.75552368164062, + "logps/rejected": -211.24728393554688, + "loss": 0.5804, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.2484490871429443, + "rewards/margins": 0.3602822721004486, + "rewards/rejected": -1.6087315082550049, + "step": 12850 + }, + { + "epoch": 2.215713301171606, + "grad_norm": 29.184051513671875, + "learning_rate": 1.9415673144667326e-08, + "logits/chosen": -2.2426769733428955, + "logits/rejected": -2.216895580291748, + "logps/chosen": -186.26821899414062, + "logps/rejected": -218.3754425048828, + "loss": 0.5877, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.2924940586090088, + "rewards/margins": 0.3836742341518402, + "rewards/rejected": -1.676168441772461, + "step": 12860 + }, + { + "epoch": 2.217436250861475, + "grad_norm": 40.87593078613281, + "learning_rate": 1.9336432982134266e-08, + "logits/chosen": -2.2999205589294434, + "logits/rejected": -2.2765324115753174, + "logps/chosen": -176.50772094726562, + "logps/rejected": -205.4868927001953, + "loss": 0.6181, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.2287260293960571, + "rewards/margins": 0.32602426409721375, + "rewards/rejected": -1.5547503232955933, + "step": 12870 + }, + { + "epoch": 2.219159200551344, + "grad_norm": 40.17512893676758, + "learning_rate": 1.925731606884998e-08, + "logits/chosen": -2.3098721504211426, + "logits/rejected": -2.290262222290039, + "logps/chosen": -175.09259033203125, + "logps/rejected": -207.22021484375, + "loss": 0.5977, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1981713771820068, + "rewards/margins": 0.3191829323768616, + "rewards/rejected": -1.5173542499542236, + "step": 12880 + }, + { + "epoch": 2.220882150241213, + "grad_norm": 35.549747467041016, + "learning_rate": 1.9178322722817288e-08, + "logits/chosen": -2.3461060523986816, + "logits/rejected": -2.328920602798462, + "logps/chosen": -179.7633056640625, + "logps/rejected": -200.14736938476562, + "loss": 0.6199, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.2108832597732544, + "rewards/margins": 0.2605508267879486, + "rewards/rejected": -1.4714341163635254, + "step": 12890 + }, + { + "epoch": 2.222605099931082, + "grad_norm": 35.350547790527344, + "learning_rate": 1.9099453261542297e-08, + "logits/chosen": -2.3254337310791016, + "logits/rejected": -2.2910609245300293, + "logps/chosen": -179.11000061035156, + "logps/rejected": -216.47085571289062, + "loss": 0.5762, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.2380391359329224, + "rewards/margins": 0.3910645842552185, + "rewards/rejected": -1.629103660583496, + "step": 12900 + }, + { + "epoch": 2.224328049620951, + "grad_norm": 36.8025016784668, + "learning_rate": 1.9020708002033182e-08, + "logits/chosen": -2.3482630252838135, + "logits/rejected": -2.33329439163208, + "logps/chosen": -171.5440216064453, + "logps/rejected": -199.91836547851562, + "loss": 0.6273, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.196648120880127, + "rewards/margins": 0.25569668412208557, + "rewards/rejected": -1.4523446559906006, + "step": 12910 + }, + { + "epoch": 2.22605099931082, + "grad_norm": 27.304306030273438, + "learning_rate": 1.8942087260798933e-08, + "logits/chosen": -2.3011980056762695, + "logits/rejected": -2.278656482696533, + "logps/chosen": -168.2295379638672, + "logps/rejected": -209.52566528320312, + "loss": 0.5754, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.1450660228729248, + "rewards/margins": 0.4235553741455078, + "rewards/rejected": -1.5686213970184326, + "step": 12920 + }, + { + "epoch": 2.227773949000689, + "grad_norm": 33.451446533203125, + "learning_rate": 1.886359135384805e-08, + "logits/chosen": -2.3403878211975098, + "logits/rejected": -2.325918436050415, + "logps/chosen": -163.4576873779297, + "logps/rejected": -194.09799194335938, + "loss": 0.6186, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.1322734355926514, + "rewards/margins": 0.2874290347099304, + "rewards/rejected": -1.4197025299072266, + "step": 12930 + }, + { + "epoch": 2.229496898690558, + "grad_norm": 35.901519775390625, + "learning_rate": 1.8785220596687244e-08, + "logits/chosen": -2.2995190620422363, + "logits/rejected": -2.26780366897583, + "logps/chosen": -173.3542022705078, + "logps/rejected": -198.0942840576172, + "loss": 0.6165, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.200137972831726, + "rewards/margins": 0.29804515838623047, + "rewards/rejected": -1.498183250427246, + "step": 12940 + }, + { + "epoch": 2.231219848380427, + "grad_norm": 64.96479034423828, + "learning_rate": 1.870697530432019e-08, + "logits/chosen": -2.2867963314056396, + "logits/rejected": -2.264647960662842, + "logps/chosen": -173.05960083007812, + "logps/rejected": -212.035888671875, + "loss": 0.5627, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.1870536804199219, + "rewards/margins": 0.40245503187179565, + "rewards/rejected": -1.5895087718963623, + "step": 12950 + }, + { + "epoch": 2.2329427980702965, + "grad_norm": 26.18922996520996, + "learning_rate": 1.8628855791246323e-08, + "logits/chosen": -2.25118088722229, + "logits/rejected": -2.2259633541107178, + "logps/chosen": -183.46018981933594, + "logps/rejected": -200.4232635498047, + "loss": 0.6394, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.2426159381866455, + "rewards/margins": 0.23426619172096252, + "rewards/rejected": -1.4768823385238647, + "step": 12960 + }, + { + "epoch": 2.2346657477601655, + "grad_norm": 26.887893676757812, + "learning_rate": 1.8550862371459457e-08, + "logits/chosen": -2.2446653842926025, + "logits/rejected": -2.211754322052002, + "logps/chosen": -174.5418701171875, + "logps/rejected": -206.7992401123047, + "loss": 0.5685, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.2038919925689697, + "rewards/margins": 0.3909718990325928, + "rewards/rejected": -1.594863772392273, + "step": 12970 + }, + { + "epoch": 2.2363886974500344, + "grad_norm": 30.763185501098633, + "learning_rate": 1.8472995358446646e-08, + "logits/chosen": -2.2639660835266113, + "logits/rejected": -2.248469114303589, + "logps/chosen": -171.14822387695312, + "logps/rejected": -208.5131072998047, + "loss": 0.5737, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.2012474536895752, + "rewards/margins": 0.3625752329826355, + "rewards/rejected": -1.5638227462768555, + "step": 12980 + }, + { + "epoch": 2.2381116471399034, + "grad_norm": 34.40770721435547, + "learning_rate": 1.8395255065186804e-08, + "logits/chosen": -2.3413078784942627, + "logits/rejected": -2.3061013221740723, + "logps/chosen": -182.0183563232422, + "logps/rejected": -212.195068359375, + "loss": 0.5928, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2216604948043823, + "rewards/margins": 0.37895292043685913, + "rewards/rejected": -1.6006133556365967, + "step": 12990 + }, + { + "epoch": 2.2398345968297724, + "grad_norm": 38.62473678588867, + "learning_rate": 1.8317641804149575e-08, + "logits/chosen": -2.2863516807556152, + "logits/rejected": -2.259194850921631, + "logps/chosen": -183.0014190673828, + "logps/rejected": -215.2476348876953, + "loss": 0.5887, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.2855231761932373, + "rewards/margins": 0.3565601110458374, + "rewards/rejected": -1.6420834064483643, + "step": 13000 + }, + { + "epoch": 2.241557546519642, + "grad_norm": 34.64742660522461, + "learning_rate": 1.8240155887293938e-08, + "logits/chosen": -2.2462267875671387, + "logits/rejected": -2.230980396270752, + "logps/chosen": -179.2667694091797, + "logps/rejected": -206.8971710205078, + "loss": 0.6077, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.2816540002822876, + "rewards/margins": 0.29531174898147583, + "rewards/rejected": -1.5769659280776978, + "step": 13010 + }, + { + "epoch": 2.2432804962095108, + "grad_norm": 25.726648330688477, + "learning_rate": 1.8162797626067072e-08, + "logits/chosen": -2.2447333335876465, + "logits/rejected": -2.2197303771972656, + "logps/chosen": -179.4647979736328, + "logps/rejected": -208.9609832763672, + "loss": 0.6071, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.2703464031219482, + "rewards/margins": 0.3287833333015442, + "rewards/rejected": -1.5991299152374268, + "step": 13020 + }, + { + "epoch": 2.2450034458993797, + "grad_norm": 44.14027404785156, + "learning_rate": 1.808556733140306e-08, + "logits/chosen": -2.313990831375122, + "logits/rejected": -2.2862823009490967, + "logps/chosen": -183.66726684570312, + "logps/rejected": -209.05703735351562, + "loss": 0.6153, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.281714677810669, + "rewards/margins": 0.29137855768203735, + "rewards/rejected": -1.5730931758880615, + "step": 13030 + }, + { + "epoch": 2.2467263955892487, + "grad_norm": 41.62800979614258, + "learning_rate": 1.800846531372161e-08, + "logits/chosen": -2.3211021423339844, + "logits/rejected": -2.297131061553955, + "logps/chosen": -184.17926025390625, + "logps/rejected": -222.2518310546875, + "loss": 0.5852, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.2904921770095825, + "rewards/margins": 0.3817223608493805, + "rewards/rejected": -1.6722145080566406, + "step": 13040 + }, + { + "epoch": 2.2484493452791177, + "grad_norm": 37.87771224975586, + "learning_rate": 1.7931491882926813e-08, + "logits/chosen": -2.302258253097534, + "logits/rejected": -2.286159038543701, + "logps/chosen": -183.59597778320312, + "logps/rejected": -221.5272979736328, + "loss": 0.5966, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.310205340385437, + "rewards/margins": 0.37293821573257446, + "rewards/rejected": -1.6831436157226562, + "step": 13050 + }, + { + "epoch": 2.250172294968987, + "grad_norm": 35.3230094909668, + "learning_rate": 1.7854647348405993e-08, + "logits/chosen": -2.3094897270202637, + "logits/rejected": -2.2902464866638184, + "logps/chosen": -182.91275024414062, + "logps/rejected": -219.88931274414062, + "loss": 0.5999, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.2874906063079834, + "rewards/margins": 0.37509453296661377, + "rewards/rejected": -1.6625852584838867, + "step": 13060 + }, + { + "epoch": 2.251895244658856, + "grad_norm": 38.81179428100586, + "learning_rate": 1.7777932019028314e-08, + "logits/chosen": -2.254732847213745, + "logits/rejected": -2.225248336791992, + "logps/chosen": -178.22152709960938, + "logps/rejected": -203.3961639404297, + "loss": 0.6193, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.2079050540924072, + "rewards/margins": 0.30005380511283875, + "rewards/rejected": -1.5079588890075684, + "step": 13070 + }, + { + "epoch": 2.253618194348725, + "grad_norm": 31.214067459106445, + "learning_rate": 1.770134620314363e-08, + "logits/chosen": -2.2343270778656006, + "logits/rejected": -2.218808174133301, + "logps/chosen": -180.11376953125, + "logps/rejected": -211.5517120361328, + "loss": 0.6029, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2803877592086792, + "rewards/margins": 0.3238711953163147, + "rewards/rejected": -1.6042588949203491, + "step": 13080 + }, + { + "epoch": 2.255341144038594, + "grad_norm": 50.3981819152832, + "learning_rate": 1.762489020858125e-08, + "logits/chosen": -2.3294975757598877, + "logits/rejected": -2.2978947162628174, + "logps/chosen": -183.16334533691406, + "logps/rejected": -210.7142791748047, + "loss": 0.6255, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.2859891653060913, + "rewards/margins": 0.3077470362186432, + "rewards/rejected": -1.593736171722412, + "step": 13090 + }, + { + "epoch": 2.257064093728463, + "grad_norm": 29.65924644470215, + "learning_rate": 1.754856434264869e-08, + "logits/chosen": -2.3811585903167725, + "logits/rejected": -2.3489012718200684, + "logps/chosen": -179.81199645996094, + "logps/rejected": -212.436279296875, + "loss": 0.5774, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.2426912784576416, + "rewards/margins": 0.3862963318824768, + "rewards/rejected": -1.6289876699447632, + "step": 13100 + }, + { + "epoch": 2.2587870434183324, + "grad_norm": 30.150890350341797, + "learning_rate": 1.7472368912130365e-08, + "logits/chosen": -2.2300591468811035, + "logits/rejected": -2.199148416519165, + "logps/chosen": -182.3174285888672, + "logps/rejected": -218.63070678710938, + "loss": 0.5946, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.273079752922058, + "rewards/margins": 0.40432214736938477, + "rewards/rejected": -1.677402138710022, + "step": 13110 + }, + { + "epoch": 2.2605099931082013, + "grad_norm": 26.72989273071289, + "learning_rate": 1.7396304223286484e-08, + "logits/chosen": -2.344831705093384, + "logits/rejected": -2.330641508102417, + "logps/chosen": -178.80715942382812, + "logps/rejected": -210.7775421142578, + "loss": 0.6014, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2342865467071533, + "rewards/margins": 0.3504003584384918, + "rewards/rejected": -1.5846867561340332, + "step": 13120 + }, + { + "epoch": 2.2622329427980703, + "grad_norm": 29.89188575744629, + "learning_rate": 1.73203705818517e-08, + "logits/chosen": -2.384573221206665, + "logits/rejected": -2.355776309967041, + "logps/chosen": -172.6627960205078, + "logps/rejected": -217.6291046142578, + "loss": 0.5639, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.1754767894744873, + "rewards/margins": 0.4282703399658203, + "rewards/rejected": -1.603747010231018, + "step": 13130 + }, + { + "epoch": 2.2639558924879393, + "grad_norm": 29.023595809936523, + "learning_rate": 1.724456829303399e-08, + "logits/chosen": -2.2623140811920166, + "logits/rejected": -2.2416133880615234, + "logps/chosen": -171.00674438476562, + "logps/rejected": -203.25985717773438, + "loss": 0.5916, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1680914163589478, + "rewards/margins": 0.3259120285511017, + "rewards/rejected": -1.494003415107727, + "step": 13140 + }, + { + "epoch": 2.2656788421778082, + "grad_norm": 38.09290313720703, + "learning_rate": 1.71688976615133e-08, + "logits/chosen": -2.2763171195983887, + "logits/rejected": -2.2502236366271973, + "logps/chosen": -178.1101531982422, + "logps/rejected": -217.0858612060547, + "loss": 0.5706, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.2172980308532715, + "rewards/margins": 0.42803043127059937, + "rewards/rejected": -1.6453285217285156, + "step": 13150 + }, + { + "epoch": 2.2674017918676777, + "grad_norm": 41.28289031982422, + "learning_rate": 1.7093358991440466e-08, + "logits/chosen": -2.257770538330078, + "logits/rejected": -2.22393798828125, + "logps/chosen": -187.81979370117188, + "logps/rejected": -221.6351776123047, + "loss": 0.5819, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.3062386512756348, + "rewards/margins": 0.3810449242591858, + "rewards/rejected": -1.6872835159301758, + "step": 13160 + }, + { + "epoch": 2.2691247415575466, + "grad_norm": 38.38431167602539, + "learning_rate": 1.7017952586435874e-08, + "logits/chosen": -2.292358875274658, + "logits/rejected": -2.27616286277771, + "logps/chosen": -183.13650512695312, + "logps/rejected": -208.0890655517578, + "loss": 0.6288, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.2799513339996338, + "rewards/margins": 0.28485870361328125, + "rewards/rejected": -1.564810037612915, + "step": 13170 + }, + { + "epoch": 2.2708476912474156, + "grad_norm": 36.38185501098633, + "learning_rate": 1.6942678749588263e-08, + "logits/chosen": -2.2948360443115234, + "logits/rejected": -2.2646539211273193, + "logps/chosen": -179.03570556640625, + "logps/rejected": -212.93115234375, + "loss": 0.5728, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.2423360347747803, + "rewards/margins": 0.385326623916626, + "rewards/rejected": -1.6276626586914062, + "step": 13180 + }, + { + "epoch": 2.2725706409372846, + "grad_norm": 35.919898986816406, + "learning_rate": 1.686753778345359e-08, + "logits/chosen": -2.2914533615112305, + "logits/rejected": -2.263596534729004, + "logps/chosen": -180.45431518554688, + "logps/rejected": -207.91995239257812, + "loss": 0.6145, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.2361723184585571, + "rewards/margins": 0.32427722215652466, + "rewards/rejected": -1.5604493618011475, + "step": 13190 + }, + { + "epoch": 2.2742935906271535, + "grad_norm": 32.69383239746094, + "learning_rate": 1.6792529990053715e-08, + "logits/chosen": -2.278026819229126, + "logits/rejected": -2.2545487880706787, + "logps/chosen": -179.6051483154297, + "logps/rejected": -201.932861328125, + "loss": 0.6223, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2343283891677856, + "rewards/margins": 0.2624986469745636, + "rewards/rejected": -1.4968270063400269, + "step": 13200 + }, + { + "epoch": 2.2742935906271535, + "eval_logits/chosen": -2.357876777648926, + "eval_logits/rejected": -2.3459715843200684, + "eval_logps/chosen": -167.66600036621094, + "eval_logps/rejected": -188.626708984375, + "eval_loss": 0.6502256989479065, + "eval_rewards/accuracies": 0.606877326965332, + "eval_rewards/chosen": -1.0865050554275513, + "eval_rewards/margins": 0.17226597666740417, + "eval_rewards/rejected": -1.2587710618972778, + "eval_runtime": 384.8668, + "eval_samples_per_second": 11.183, + "eval_steps_per_second": 1.398, + "step": 13200 + }, + { + "epoch": 2.2760165403170225, + "grad_norm": 42.5054931640625, + "learning_rate": 1.671765567087523e-08, + "logits/chosen": -2.3471245765686035, + "logits/rejected": -2.338646650314331, + "logps/chosen": -172.4659881591797, + "logps/rejected": -201.0215606689453, + "loss": 0.6181, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.1909699440002441, + "rewards/margins": 0.29259103536605835, + "rewards/rejected": -1.4835610389709473, + "step": 13210 + }, + { + "epoch": 2.277739490006892, + "grad_norm": 27.458099365234375, + "learning_rate": 1.6642915126868203e-08, + "logits/chosen": -2.3157057762145996, + "logits/rejected": -2.2974138259887695, + "logps/chosen": -178.68539428710938, + "logps/rejected": -205.52627563476562, + "loss": 0.6127, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.22714364528656, + "rewards/margins": 0.27688199281692505, + "rewards/rejected": -1.5040256977081299, + "step": 13220 + }, + { + "epoch": 2.279462439696761, + "grad_norm": 26.042802810668945, + "learning_rate": 1.6568308658445064e-08, + "logits/chosen": -2.2837634086608887, + "logits/rejected": -2.2617480754852295, + "logps/chosen": -168.11532592773438, + "logps/rejected": -207.41650390625, + "loss": 0.5715, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.1576555967330933, + "rewards/margins": 0.39275139570236206, + "rewards/rejected": -1.5504071712493896, + "step": 13230 + }, + { + "epoch": 2.28118538938663, + "grad_norm": 37.14672088623047, + "learning_rate": 1.6493836565479324e-08, + "logits/chosen": -2.3002994060516357, + "logits/rejected": -2.2840652465820312, + "logps/chosen": -178.58566284179688, + "logps/rejected": -211.67404174804688, + "loss": 0.5895, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2686278820037842, + "rewards/margins": 0.3217323422431946, + "rewards/rejected": -1.5903600454330444, + "step": 13240 + }, + { + "epoch": 2.282908339076499, + "grad_norm": 29.27492904663086, + "learning_rate": 1.6419499147304366e-08, + "logits/chosen": -2.264556646347046, + "logits/rejected": -2.253521203994751, + "logps/chosen": -177.2767791748047, + "logps/rejected": -214.045166015625, + "loss": 0.5916, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.2435022592544556, + "rewards/margins": 0.34912917017936707, + "rewards/rejected": -1.5926315784454346, + "step": 13250 + }, + { + "epoch": 2.2846312887663682, + "grad_norm": 29.972997665405273, + "learning_rate": 1.634529670271224e-08, + "logits/chosen": -2.3521368503570557, + "logits/rejected": -2.3302011489868164, + "logps/chosen": -179.6995391845703, + "logps/rejected": -224.6154327392578, + "loss": 0.5766, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3029943704605103, + "rewards/margins": 0.42312970757484436, + "rewards/rejected": -1.7261241674423218, + "step": 13260 + }, + { + "epoch": 2.286354238456237, + "grad_norm": 31.261409759521484, + "learning_rate": 1.6271229529952563e-08, + "logits/chosen": -2.232389211654663, + "logits/rejected": -2.217360258102417, + "logps/chosen": -186.85389709472656, + "logps/rejected": -220.95077514648438, + "loss": 0.5922, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.3120219707489014, + "rewards/margins": 0.3629273474216461, + "rewards/rejected": -1.674949288368225, + "step": 13270 + }, + { + "epoch": 2.288077188146106, + "grad_norm": 34.21379089355469, + "learning_rate": 1.619729792673114e-08, + "logits/chosen": -2.2794029712677, + "logits/rejected": -2.2498767375946045, + "logps/chosen": -177.8300018310547, + "logps/rejected": -207.9958953857422, + "loss": 0.6091, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.228564739227295, + "rewards/margins": 0.34510141611099243, + "rewards/rejected": -1.5736663341522217, + "step": 13280 + }, + { + "epoch": 2.289800137835975, + "grad_norm": 36.05464172363281, + "learning_rate": 1.6123502190208944e-08, + "logits/chosen": -2.2673323154449463, + "logits/rejected": -2.2458250522613525, + "logps/chosen": -180.65086364746094, + "logps/rejected": -213.90859985351562, + "loss": 0.5841, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.2555345296859741, + "rewards/margins": 0.35977333784103394, + "rewards/rejected": -1.6153080463409424, + "step": 13290 + }, + { + "epoch": 2.291523087525844, + "grad_norm": 28.938241958618164, + "learning_rate": 1.6049842617000826e-08, + "logits/chosen": -2.285695791244507, + "logits/rejected": -2.2649500370025635, + "logps/chosen": -190.43856811523438, + "logps/rejected": -215.55447387695312, + "loss": 0.6485, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.3869019746780396, + "rewards/margins": 0.2633451819419861, + "rewards/rejected": -1.65024733543396, + "step": 13300 + }, + { + "epoch": 2.293246037215713, + "grad_norm": 36.23344421386719, + "learning_rate": 1.5976319503174313e-08, + "logits/chosen": -2.284667491912842, + "logits/rejected": -2.2592196464538574, + "logps/chosen": -192.6880645751953, + "logps/rejected": -225.78768920898438, + "loss": 0.5965, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.3499374389648438, + "rewards/margins": 0.37441927194595337, + "rewards/rejected": -1.7243566513061523, + "step": 13310 + }, + { + "epoch": 2.2949689869055825, + "grad_norm": 52.024208068847656, + "learning_rate": 1.590293314424846e-08, + "logits/chosen": -2.314713716506958, + "logits/rejected": -2.2793240547180176, + "logps/chosen": -192.59432983398438, + "logps/rejected": -208.95361328125, + "loss": 0.6508, + "rewards/accuracies": 0.59375, + "rewards/chosen": -1.3572700023651123, + "rewards/margins": 0.25697070360183716, + "rewards/rejected": -1.6142408847808838, + "step": 13320 + }, + { + "epoch": 2.2966919365954515, + "grad_norm": 45.16383743286133, + "learning_rate": 1.582968383519267e-08, + "logits/chosen": -2.242969036102295, + "logits/rejected": -2.2187442779541016, + "logps/chosen": -181.7742919921875, + "logps/rejected": -214.463134765625, + "loss": 0.592, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.2493605613708496, + "rewards/margins": 0.40000343322753906, + "rewards/rejected": -1.6493641138076782, + "step": 13330 + }, + { + "epoch": 2.2984148862853204, + "grad_norm": 40.29065704345703, + "learning_rate": 1.5756571870425485e-08, + "logits/chosen": -2.341768264770508, + "logits/rejected": -2.325599193572998, + "logps/chosen": -188.314697265625, + "logps/rejected": -216.07229614257812, + "loss": 0.6231, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3263964653015137, + "rewards/margins": 0.30297255516052246, + "rewards/rejected": -1.6293690204620361, + "step": 13340 + }, + { + "epoch": 2.3001378359751894, + "grad_norm": 39.64885711669922, + "learning_rate": 1.568359754381337e-08, + "logits/chosen": -2.3034918308258057, + "logits/rejected": -2.2614338397979736, + "logps/chosen": -176.49429321289062, + "logps/rejected": -216.81161499023438, + "loss": 0.563, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.1907312870025635, + "rewards/margins": 0.46979132294654846, + "rewards/rejected": -1.6605224609375, + "step": 13350 + }, + { + "epoch": 2.301860785665059, + "grad_norm": 40.885826110839844, + "learning_rate": 1.5610761148669588e-08, + "logits/chosen": -2.2991368770599365, + "logits/rejected": -2.2754249572753906, + "logps/chosen": -188.7205810546875, + "logps/rejected": -222.1409912109375, + "loss": 0.5961, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.3075730800628662, + "rewards/margins": 0.3844447731971741, + "rewards/rejected": -1.6920177936553955, + "step": 13360 + }, + { + "epoch": 2.3035837353549278, + "grad_norm": 47.9317512512207, + "learning_rate": 1.5538062977753007e-08, + "logits/chosen": -2.265645742416382, + "logits/rejected": -2.2291998863220215, + "logps/chosen": -186.75180053710938, + "logps/rejected": -219.0263671875, + "loss": 0.6174, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.3111344575881958, + "rewards/margins": 0.3566862940788269, + "rewards/rejected": -1.667820692062378, + "step": 13370 + }, + { + "epoch": 2.3053066850447967, + "grad_norm": 32.88581466674805, + "learning_rate": 1.5465503323266933e-08, + "logits/chosen": -2.2227225303649902, + "logits/rejected": -2.1929125785827637, + "logps/chosen": -188.45529174804688, + "logps/rejected": -215.32308959960938, + "loss": 0.6093, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.3009912967681885, + "rewards/margins": 0.3205062448978424, + "rewards/rejected": -1.6214977502822876, + "step": 13380 + }, + { + "epoch": 2.3070296347346657, + "grad_norm": 46.64186477661133, + "learning_rate": 1.539308247685787e-08, + "logits/chosen": -2.255293607711792, + "logits/rejected": -2.2119433879852295, + "logps/chosen": -175.30612182617188, + "logps/rejected": -201.49118041992188, + "loss": 0.6095, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.1783514022827148, + "rewards/margins": 0.32855120301246643, + "rewards/rejected": -1.5069026947021484, + "step": 13390 + }, + { + "epoch": 2.3087525844245347, + "grad_norm": 31.650501251220703, + "learning_rate": 1.532080072961442e-08, + "logits/chosen": -2.2712759971618652, + "logits/rejected": -2.240250825881958, + "logps/chosen": -171.73333740234375, + "logps/rejected": -209.0310516357422, + "loss": 0.5619, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.1551803350448608, + "rewards/margins": 0.4344462454319, + "rewards/rejected": -1.5896265506744385, + "step": 13400 + }, + { + "epoch": 2.3104755341144037, + "grad_norm": 31.78446388244629, + "learning_rate": 1.5248658372066107e-08, + "logits/chosen": -2.301515817642212, + "logits/rejected": -2.2714321613311768, + "logps/chosen": -182.88613891601562, + "logps/rejected": -215.9427947998047, + "loss": 0.5949, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.2810585498809814, + "rewards/margins": 0.37699708342552185, + "rewards/rejected": -1.6580556631088257, + "step": 13410 + }, + { + "epoch": 2.312198483804273, + "grad_norm": 42.18976974487305, + "learning_rate": 1.5176655694182156e-08, + "logits/chosen": -2.290903329849243, + "logits/rejected": -2.2751002311706543, + "logps/chosen": -177.2351531982422, + "logps/rejected": -218.5797119140625, + "loss": 0.5751, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.2314550876617432, + "rewards/margins": 0.40304645895957947, + "rewards/rejected": -1.6345014572143555, + "step": 13420 + }, + { + "epoch": 2.313921433494142, + "grad_norm": 36.520469665527344, + "learning_rate": 1.5104792985370406e-08, + "logits/chosen": -2.360677480697632, + "logits/rejected": -2.332749605178833, + "logps/chosen": -178.2684326171875, + "logps/rejected": -212.31723022460938, + "loss": 0.6023, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.2227405309677124, + "rewards/margins": 0.3327510952949524, + "rewards/rejected": -1.5554919242858887, + "step": 13430 + }, + { + "epoch": 2.315644383184011, + "grad_norm": 34.495445251464844, + "learning_rate": 1.5033070534476055e-08, + "logits/chosen": -2.2259278297424316, + "logits/rejected": -2.2103095054626465, + "logps/chosen": -172.03411865234375, + "logps/rejected": -199.85397338867188, + "loss": 0.6245, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.1800358295440674, + "rewards/margins": 0.302860826253891, + "rewards/rejected": -1.4828965663909912, + "step": 13440 + }, + { + "epoch": 2.31736733287388, + "grad_norm": 28.928640365600586, + "learning_rate": 1.4961488629780604e-08, + "logits/chosen": -2.2545201778411865, + "logits/rejected": -2.230071544647217, + "logps/chosen": -171.535400390625, + "logps/rejected": -207.2544403076172, + "loss": 0.5966, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2039422988891602, + "rewards/margins": 0.3546209931373596, + "rewards/rejected": -1.558563470840454, + "step": 13450 + }, + { + "epoch": 2.3190902825637494, + "grad_norm": 21.376005172729492, + "learning_rate": 1.489004755900058e-08, + "logits/chosen": -2.2931625843048096, + "logits/rejected": -2.260887384414673, + "logps/chosen": -175.7183074951172, + "logps/rejected": -213.6696014404297, + "loss": 0.5651, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.1990082263946533, + "rewards/margins": 0.44000688195228577, + "rewards/rejected": -1.6390151977539062, + "step": 13460 + }, + { + "epoch": 2.3208132322536184, + "grad_norm": 34.317535400390625, + "learning_rate": 1.4818747609286486e-08, + "logits/chosen": -2.214077949523926, + "logits/rejected": -2.1879045963287354, + "logps/chosen": -185.1648406982422, + "logps/rejected": -211.9553680419922, + "loss": 0.6136, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.293401837348938, + "rewards/margins": 0.32934027910232544, + "rewards/rejected": -1.622741937637329, + "step": 13470 + }, + { + "epoch": 2.3225361819434873, + "grad_norm": 43.287086486816406, + "learning_rate": 1.4747589067221627e-08, + "logits/chosen": -2.3157906532287598, + "logits/rejected": -2.275646686553955, + "logps/chosen": -175.83084106445312, + "logps/rejected": -211.6923065185547, + "loss": 0.572, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.216455101966858, + "rewards/margins": 0.3852939009666443, + "rewards/rejected": -1.6017488241195679, + "step": 13480 + }, + { + "epoch": 2.3242591316333563, + "grad_norm": 33.92958450317383, + "learning_rate": 1.4676572218820831e-08, + "logits/chosen": -2.2876946926116943, + "logits/rejected": -2.2540640830993652, + "logps/chosen": -191.86166381835938, + "logps/rejected": -230.7425537109375, + "loss": 0.5832, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.3764475584030151, + "rewards/margins": 0.4016246795654297, + "rewards/rejected": -1.7780723571777344, + "step": 13490 + }, + { + "epoch": 2.3259820813232253, + "grad_norm": 35.81848907470703, + "learning_rate": 1.4605697349529494e-08, + "logits/chosen": -2.2865166664123535, + "logits/rejected": -2.25510573387146, + "logps/chosen": -187.5164337158203, + "logps/rejected": -223.8311004638672, + "loss": 0.5741, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3080507516860962, + "rewards/margins": 0.41828426718711853, + "rewards/rejected": -1.7263351678848267, + "step": 13500 + }, + { + "epoch": 2.3277050310130942, + "grad_norm": 30.200706481933594, + "learning_rate": 1.4534964744222339e-08, + "logits/chosen": -2.218099594116211, + "logits/rejected": -2.2067201137542725, + "logps/chosen": -178.73098754882812, + "logps/rejected": -213.6283416748047, + "loss": 0.6113, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.28030526638031, + "rewards/margins": 0.31757602095603943, + "rewards/rejected": -1.5978811979293823, + "step": 13510 + }, + { + "epoch": 2.3294279807029636, + "grad_norm": 37.271827697753906, + "learning_rate": 1.4464374687202224e-08, + "logits/chosen": -2.188913583755493, + "logits/rejected": -2.1714272499084473, + "logps/chosen": -185.5612335205078, + "logps/rejected": -218.77609252929688, + "loss": 0.6041, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.3394790887832642, + "rewards/margins": 0.34229475259780884, + "rewards/rejected": -1.6817739009857178, + "step": 13520 + }, + { + "epoch": 2.3311509303928326, + "grad_norm": 39.160675048828125, + "learning_rate": 1.4393927462199062e-08, + "logits/chosen": -2.2882041931152344, + "logits/rejected": -2.2603230476379395, + "logps/chosen": -184.82806396484375, + "logps/rejected": -202.9477996826172, + "loss": 0.6465, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.29298996925354, + "rewards/margins": 0.22285866737365723, + "rewards/rejected": -1.5158485174179077, + "step": 13530 + }, + { + "epoch": 2.3328738800827016, + "grad_norm": 39.556976318359375, + "learning_rate": 1.4323623352368691e-08, + "logits/chosen": -2.2315313816070557, + "logits/rejected": -2.1995949745178223, + "logps/chosen": -184.9795684814453, + "logps/rejected": -216.235595703125, + "loss": 0.5828, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.285422682762146, + "rewards/margins": 0.38336047530174255, + "rewards/rejected": -1.668783187866211, + "step": 13540 + }, + { + "epoch": 2.3345968297725705, + "grad_norm": 31.444944381713867, + "learning_rate": 1.4253462640291708e-08, + "logits/chosen": -2.2623448371887207, + "logits/rejected": -2.2332801818847656, + "logps/chosen": -185.16944885253906, + "logps/rejected": -216.3262176513672, + "loss": 0.6145, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.3362573385238647, + "rewards/margins": 0.3372352421283722, + "rewards/rejected": -1.6734927892684937, + "step": 13550 + }, + { + "epoch": 2.3363197794624395, + "grad_norm": 40.11167526245117, + "learning_rate": 1.4183445607972299e-08, + "logits/chosen": -2.2807250022888184, + "logits/rejected": -2.2724642753601074, + "logps/chosen": -179.11654663085938, + "logps/rejected": -201.07957458496094, + "loss": 0.6562, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2554104328155518, + "rewards/margins": 0.2322661429643631, + "rewards/rejected": -1.4876763820648193, + "step": 13560 + }, + { + "epoch": 2.338042729152309, + "grad_norm": 35.94459915161133, + "learning_rate": 1.4113572536837192e-08, + "logits/chosen": -2.2167115211486816, + "logits/rejected": -2.1958794593811035, + "logps/chosen": -184.91183471679688, + "logps/rejected": -224.0436248779297, + "loss": 0.5753, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.325109601020813, + "rewards/margins": 0.4127611219882965, + "rewards/rejected": -1.7378708124160767, + "step": 13570 + }, + { + "epoch": 2.339765678842178, + "grad_norm": 31.295074462890625, + "learning_rate": 1.4043843707734448e-08, + "logits/chosen": -2.2628185749053955, + "logits/rejected": -2.2513587474823, + "logps/chosen": -179.8581085205078, + "logps/rejected": -209.72250366210938, + "loss": 0.6121, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.2665109634399414, + "rewards/margins": 0.3041861653327942, + "rewards/rejected": -1.5706971883773804, + "step": 13580 + }, + { + "epoch": 2.341488628532047, + "grad_norm": 54.577552795410156, + "learning_rate": 1.3974259400932348e-08, + "logits/chosen": -2.2290074825286865, + "logits/rejected": -2.2191624641418457, + "logps/chosen": -180.3831329345703, + "logps/rejected": -216.0695343017578, + "loss": 0.5961, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.2730530500411987, + "rewards/margins": 0.35534295439720154, + "rewards/rejected": -1.6283960342407227, + "step": 13590 + }, + { + "epoch": 2.343211578221916, + "grad_norm": 37.03809356689453, + "learning_rate": 1.3904819896118314e-08, + "logits/chosen": -2.283867597579956, + "logits/rejected": -2.2657036781311035, + "logps/chosen": -189.1884307861328, + "logps/rejected": -212.0028076171875, + "loss": 0.6245, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.2924267053604126, + "rewards/margins": 0.2556527256965637, + "rewards/rejected": -1.5480793714523315, + "step": 13600 + }, + { + "epoch": 2.343211578221916, + "eval_logits/chosen": -2.3582911491394043, + "eval_logits/rejected": -2.346229076385498, + "eval_logps/chosen": -167.07147216796875, + "eval_logps/rejected": -188.04969787597656, + "eval_loss": 0.6505503058433533, + "eval_rewards/accuracies": 0.598280668258667, + "eval_rewards/chosen": -1.0805600881576538, + "eval_rewards/margins": 0.172440767288208, + "eval_rewards/rejected": -1.2530008554458618, + "eval_runtime": 384.874, + "eval_samples_per_second": 11.183, + "eval_steps_per_second": 1.398, + "step": 13600 + }, + { + "epoch": 2.344934527911785, + "grad_norm": 36.64026641845703, + "learning_rate": 1.3835525472397747e-08, + "logits/chosen": -2.4126977920532227, + "logits/rejected": -2.387202262878418, + "logps/chosen": -173.5795135498047, + "logps/rejected": -202.99668884277344, + "loss": 0.625, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.1968533992767334, + "rewards/margins": 0.3015924096107483, + "rewards/rejected": -1.498445749282837, + "step": 13610 + }, + { + "epoch": 2.346657477601654, + "grad_norm": 30.19744110107422, + "learning_rate": 1.376637640829289e-08, + "logits/chosen": -2.2988693714141846, + "logits/rejected": -2.2542026042938232, + "logps/chosen": -177.09573364257812, + "logps/rejected": -212.8185272216797, + "loss": 0.5693, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.1949437856674194, + "rewards/margins": 0.42264899611473083, + "rewards/rejected": -1.6175928115844727, + "step": 13620 + }, + { + "epoch": 2.348380427291523, + "grad_norm": 35.095787048339844, + "learning_rate": 1.3697372981741707e-08, + "logits/chosen": -2.236766815185547, + "logits/rejected": -2.196708917617798, + "logps/chosen": -181.1944122314453, + "logps/rejected": -215.4459991455078, + "loss": 0.5747, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.2550218105316162, + "rewards/margins": 0.4060831665992737, + "rewards/rejected": -1.6611049175262451, + "step": 13630 + }, + { + "epoch": 2.350103376981392, + "grad_norm": 30.182931900024414, + "learning_rate": 1.362851547009684e-08, + "logits/chosen": -2.2480995655059814, + "logits/rejected": -2.221525192260742, + "logps/chosen": -175.38925170898438, + "logps/rejected": -218.19741821289062, + "loss": 0.5489, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.1962645053863525, + "rewards/margins": 0.45355424284935, + "rewards/rejected": -1.649818778038025, + "step": 13640 + }, + { + "epoch": 2.351826326671261, + "grad_norm": 33.286869049072266, + "learning_rate": 1.3559804150124421e-08, + "logits/chosen": -2.3198165893554688, + "logits/rejected": -2.2934446334838867, + "logps/chosen": -176.68719482421875, + "logps/rejected": -210.19088745117188, + "loss": 0.5757, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.2155816555023193, + "rewards/margins": 0.3787309527397156, + "rewards/rejected": -1.5943125486373901, + "step": 13650 + }, + { + "epoch": 2.35354927636113, + "grad_norm": 38.136558532714844, + "learning_rate": 1.3491239298002954e-08, + "logits/chosen": -2.2096331119537354, + "logits/rejected": -2.186317205429077, + "logps/chosen": -180.8411102294922, + "logps/rejected": -211.2050323486328, + "loss": 0.6122, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.2719953060150146, + "rewards/margins": 0.30958497524261475, + "rewards/rejected": -1.581580400466919, + "step": 13660 + }, + { + "epoch": 2.3552722260509995, + "grad_norm": 30.543697357177734, + "learning_rate": 1.3422821189322231e-08, + "logits/chosen": -2.297574520111084, + "logits/rejected": -2.273895740509033, + "logps/chosen": -183.28347778320312, + "logps/rejected": -207.8615264892578, + "loss": 0.6355, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.2745354175567627, + "rewards/margins": 0.2656957507133484, + "rewards/rejected": -1.5402309894561768, + "step": 13670 + }, + { + "epoch": 2.3569951757408685, + "grad_norm": 33.59134292602539, + "learning_rate": 1.3354550099082256e-08, + "logits/chosen": -2.309947967529297, + "logits/rejected": -2.279665470123291, + "logps/chosen": -179.7958984375, + "logps/rejected": -203.94461059570312, + "loss": 0.6041, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.2364991903305054, + "rewards/margins": 0.2978908121585846, + "rewards/rejected": -1.5343900918960571, + "step": 13680 + }, + { + "epoch": 2.3587181254307374, + "grad_norm": 45.42689895629883, + "learning_rate": 1.3286426301692105e-08, + "logits/chosen": -2.2824180126190186, + "logits/rejected": -2.250120162963867, + "logps/chosen": -177.89151000976562, + "logps/rejected": -214.27743530273438, + "loss": 0.5862, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.2323997020721436, + "rewards/margins": 0.3764374852180481, + "rewards/rejected": -1.6088371276855469, + "step": 13690 + }, + { + "epoch": 2.3604410751206064, + "grad_norm": 46.692657470703125, + "learning_rate": 1.321845007096879e-08, + "logits/chosen": -2.278366804122925, + "logits/rejected": -2.248579502105713, + "logps/chosen": -186.04518127441406, + "logps/rejected": -209.16122436523438, + "loss": 0.6153, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.263258695602417, + "rewards/margins": 0.30537840723991394, + "rewards/rejected": -1.5686371326446533, + "step": 13700 + }, + { + "epoch": 2.3621640248104754, + "grad_norm": 40.69031524658203, + "learning_rate": 1.3150621680136197e-08, + "logits/chosen": -2.233503818511963, + "logits/rejected": -2.1990857124328613, + "logps/chosen": -173.60745239257812, + "logps/rejected": -212.70382690429688, + "loss": 0.5709, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.1895692348480225, + "rewards/margins": 0.4038330912590027, + "rewards/rejected": -1.5934025049209595, + "step": 13710 + }, + { + "epoch": 2.3638869745003444, + "grad_norm": 37.574806213378906, + "learning_rate": 1.3082941401824027e-08, + "logits/chosen": -2.2590606212615967, + "logits/rejected": -2.2263731956481934, + "logps/chosen": -168.90953063964844, + "logps/rejected": -199.29470825195312, + "loss": 0.597, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.1364753246307373, + "rewards/margins": 0.3397006094455719, + "rewards/rejected": -1.4761759042739868, + "step": 13720 + }, + { + "epoch": 2.3656099241902138, + "grad_norm": 59.643516540527344, + "learning_rate": 1.30154095080666e-08, + "logits/chosen": -2.369276523590088, + "logits/rejected": -2.3432652950286865, + "logps/chosen": -175.1021270751953, + "logps/rejected": -216.71896362304688, + "loss": 0.5713, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.1881558895111084, + "rewards/margins": 0.4090685248374939, + "rewards/rejected": -1.597224473953247, + "step": 13730 + }, + { + "epoch": 2.3673328738800827, + "grad_norm": 30.843334197998047, + "learning_rate": 1.2948026270301853e-08, + "logits/chosen": -2.348867654800415, + "logits/rejected": -2.317417621612549, + "logps/chosen": -179.7681427001953, + "logps/rejected": -213.34017944335938, + "loss": 0.5824, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.2141062021255493, + "rewards/margins": 0.39131397008895874, + "rewards/rejected": -1.6054203510284424, + "step": 13740 + }, + { + "epoch": 2.3690558235699517, + "grad_norm": 39.30093002319336, + "learning_rate": 1.2880791959370235e-08, + "logits/chosen": -2.307525157928467, + "logits/rejected": -2.279491424560547, + "logps/chosen": -183.9978485107422, + "logps/rejected": -220.3440399169922, + "loss": 0.591, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.274763822555542, + "rewards/margins": 0.39177602529525757, + "rewards/rejected": -1.6665397882461548, + "step": 13750 + }, + { + "epoch": 2.3707787732598207, + "grad_norm": 46.24401092529297, + "learning_rate": 1.2813706845513556e-08, + "logits/chosen": -2.359976291656494, + "logits/rejected": -2.327162504196167, + "logps/chosen": -171.99435424804688, + "logps/rejected": -200.48883056640625, + "loss": 0.6053, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.1655842065811157, + "rewards/margins": 0.3405342102050781, + "rewards/rejected": -1.5061182975769043, + "step": 13760 + }, + { + "epoch": 2.37250172294969, + "grad_norm": 43.86614227294922, + "learning_rate": 1.274677119837393e-08, + "logits/chosen": -2.3412115573883057, + "logits/rejected": -2.3285574913024902, + "logps/chosen": -178.02932739257812, + "logps/rejected": -214.586181640625, + "loss": 0.5976, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.265352725982666, + "rewards/margins": 0.34253057837486267, + "rewards/rejected": -1.6078834533691406, + "step": 13770 + }, + { + "epoch": 2.374224672639559, + "grad_norm": 31.504146575927734, + "learning_rate": 1.2679985286992762e-08, + "logits/chosen": -2.3894755840301514, + "logits/rejected": -2.3525168895721436, + "logps/chosen": -188.07192993164062, + "logps/rejected": -205.6201171875, + "loss": 0.6331, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.280479073524475, + "rewards/margins": 0.26705050468444824, + "rewards/rejected": -1.5475298166275024, + "step": 13780 + }, + { + "epoch": 2.375947622329428, + "grad_norm": 40.06073760986328, + "learning_rate": 1.2613349379809596e-08, + "logits/chosen": -2.299266815185547, + "logits/rejected": -2.271803140640259, + "logps/chosen": -181.13267517089844, + "logps/rejected": -212.23593139648438, + "loss": 0.5869, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.2333463430404663, + "rewards/margins": 0.36941683292388916, + "rewards/rejected": -1.6027634143829346, + "step": 13790 + }, + { + "epoch": 2.377670572019297, + "grad_norm": 31.72646141052246, + "learning_rate": 1.2546863744660975e-08, + "logits/chosen": -2.3663368225097656, + "logits/rejected": -2.3349125385284424, + "logps/chosen": -174.5872802734375, + "logps/rejected": -204.26718139648438, + "loss": 0.581, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.1611812114715576, + "rewards/margins": 0.3709130883216858, + "rewards/rejected": -1.5320942401885986, + "step": 13800 + }, + { + "epoch": 2.379393521709166, + "grad_norm": 39.007442474365234, + "learning_rate": 1.2480528648779532e-08, + "logits/chosen": -2.2920098304748535, + "logits/rejected": -2.2649521827697754, + "logps/chosen": -164.62759399414062, + "logps/rejected": -201.13555908203125, + "loss": 0.5844, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.10493803024292, + "rewards/margins": 0.37880510091781616, + "rewards/rejected": -1.4837430715560913, + "step": 13810 + }, + { + "epoch": 2.381116471399035, + "grad_norm": 32.280784606933594, + "learning_rate": 1.2414344358792784e-08, + "logits/chosen": -2.339385747909546, + "logits/rejected": -2.3074727058410645, + "logps/chosen": -177.5450439453125, + "logps/rejected": -200.82669067382812, + "loss": 0.6071, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.166850209236145, + "rewards/margins": 0.3151569664478302, + "rewards/rejected": -1.4820071458816528, + "step": 13820 + }, + { + "epoch": 2.3828394210889043, + "grad_norm": 30.069440841674805, + "learning_rate": 1.2348311140722079e-08, + "logits/chosen": -2.3644509315490723, + "logits/rejected": -2.3493714332580566, + "logps/chosen": -170.24691772460938, + "logps/rejected": -197.38812255859375, + "loss": 0.5992, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.1383792161941528, + "rewards/margins": 0.3191404938697815, + "rewards/rejected": -1.45751953125, + "step": 13830 + }, + { + "epoch": 2.3845623707787733, + "grad_norm": 41.18797302246094, + "learning_rate": 1.2282429259981597e-08, + "logits/chosen": -2.3379101753234863, + "logits/rejected": -2.30659556388855, + "logps/chosen": -175.2816162109375, + "logps/rejected": -193.45132446289062, + "loss": 0.6339, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -1.177595853805542, + "rewards/margins": 0.24431820213794708, + "rewards/rejected": -1.4219141006469727, + "step": 13840 + }, + { + "epoch": 2.3862853204686423, + "grad_norm": 36.08599090576172, + "learning_rate": 1.221669898137716e-08, + "logits/chosen": -2.2660117149353027, + "logits/rejected": -2.2420566082000732, + "logps/chosen": -171.4685516357422, + "logps/rejected": -193.0071258544922, + "loss": 0.6317, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.16433584690094, + "rewards/margins": 0.2799225449562073, + "rewards/rejected": -1.4442580938339233, + "step": 13850 + }, + { + "epoch": 2.3880082701585112, + "grad_norm": 31.845666885375977, + "learning_rate": 1.2151120569105316e-08, + "logits/chosen": -2.2724967002868652, + "logits/rejected": -2.2546486854553223, + "logps/chosen": -175.4547882080078, + "logps/rejected": -204.92172241210938, + "loss": 0.6033, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.2325869798660278, + "rewards/margins": 0.310187965631485, + "rewards/rejected": -1.5427749156951904, + "step": 13860 + }, + { + "epoch": 2.3897312198483807, + "grad_norm": 35.77281951904297, + "learning_rate": 1.208569428675214e-08, + "logits/chosen": -2.310751438140869, + "logits/rejected": -2.2911205291748047, + "logps/chosen": -185.12376403808594, + "logps/rejected": -212.27761840820312, + "loss": 0.6249, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2908276319503784, + "rewards/margins": 0.3130032420158386, + "rewards/rejected": -1.6038309335708618, + "step": 13870 + }, + { + "epoch": 2.3914541695382496, + "grad_norm": 27.441791534423828, + "learning_rate": 1.2020420397292285e-08, + "logits/chosen": -2.2574360370635986, + "logits/rejected": -2.2178995609283447, + "logps/chosen": -170.9420166015625, + "logps/rejected": -205.03140258789062, + "loss": 0.5863, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.1549043655395508, + "rewards/margins": 0.37761181592941284, + "rewards/rejected": -1.5325162410736084, + "step": 13880 + }, + { + "epoch": 2.3931771192281186, + "grad_norm": 25.524450302124023, + "learning_rate": 1.1955299163087818e-08, + "logits/chosen": -2.333522081375122, + "logits/rejected": -2.3122828006744385, + "logps/chosen": -172.7584991455078, + "logps/rejected": -199.32009887695312, + "loss": 0.6054, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.1644203662872314, + "rewards/margins": 0.3049108386039734, + "rewards/rejected": -1.46933114528656, + "step": 13890 + }, + { + "epoch": 2.3949000689179876, + "grad_norm": 33.01766586303711, + "learning_rate": 1.1890330845887292e-08, + "logits/chosen": -2.256608724594116, + "logits/rejected": -2.2248635292053223, + "logps/chosen": -166.27073669433594, + "logps/rejected": -197.65635681152344, + "loss": 0.5851, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1234867572784424, + "rewards/margins": 0.36823463439941406, + "rewards/rejected": -1.4917213916778564, + "step": 13900 + }, + { + "epoch": 2.3966230186078565, + "grad_norm": 35.33543395996094, + "learning_rate": 1.1825515706824563e-08, + "logits/chosen": -2.284083127975464, + "logits/rejected": -2.260112762451172, + "logps/chosen": -166.86380004882812, + "logps/rejected": -190.37025451660156, + "loss": 0.6052, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.0944288969039917, + "rewards/margins": 0.29038792848587036, + "rewards/rejected": -1.3848168849945068, + "step": 13910 + }, + { + "epoch": 2.3983459682977255, + "grad_norm": 43.38460922241211, + "learning_rate": 1.1760854006417848e-08, + "logits/chosen": -2.335136890411377, + "logits/rejected": -2.299079418182373, + "logps/chosen": -177.9640350341797, + "logps/rejected": -202.99925231933594, + "loss": 0.5887, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.1683671474456787, + "rewards/margins": 0.34926632046699524, + "rewards/rejected": -1.5176336765289307, + "step": 13920 + }, + { + "epoch": 2.400068917987595, + "grad_norm": 32.38801574707031, + "learning_rate": 1.1696346004568597e-08, + "logits/chosen": -2.2909762859344482, + "logits/rejected": -2.268258571624756, + "logps/chosen": -163.80203247070312, + "logps/rejected": -187.72994995117188, + "loss": 0.6266, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.1165748834609985, + "rewards/margins": 0.25797998905181885, + "rewards/rejected": -1.3745548725128174, + "step": 13930 + }, + { + "epoch": 2.401791867677464, + "grad_norm": 28.899768829345703, + "learning_rate": 1.1631991960560494e-08, + "logits/chosen": -2.264949083328247, + "logits/rejected": -2.2347309589385986, + "logps/chosen": -162.36122131347656, + "logps/rejected": -207.45498657226562, + "loss": 0.5682, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.0848444700241089, + "rewards/margins": 0.43307724595069885, + "rewards/rejected": -1.5179216861724854, + "step": 13940 + }, + { + "epoch": 2.403514817367333, + "grad_norm": 36.21199035644531, + "learning_rate": 1.1567792133058418e-08, + "logits/chosen": -2.3258869647979736, + "logits/rejected": -2.300102949142456, + "logps/chosen": -171.20603942871094, + "logps/rejected": -201.0448455810547, + "loss": 0.618, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.1686848402023315, + "rewards/margins": 0.3188968300819397, + "rewards/rejected": -1.487581729888916, + "step": 13950 + }, + { + "epoch": 2.405237767057202, + "grad_norm": 32.96730041503906, + "learning_rate": 1.1503746780107394e-08, + "logits/chosen": -2.192915439605713, + "logits/rejected": -2.1882834434509277, + "logps/chosen": -172.82211303710938, + "logps/rejected": -193.9451141357422, + "loss": 0.6298, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.1832388639450073, + "rewards/margins": 0.23356863856315613, + "rewards/rejected": -1.4168074131011963, + "step": 13960 + }, + { + "epoch": 2.406960716747071, + "grad_norm": 28.457719802856445, + "learning_rate": 1.1439856159131528e-08, + "logits/chosen": -2.2863781452178955, + "logits/rejected": -2.2556209564208984, + "logps/chosen": -173.32937622070312, + "logps/rejected": -203.6952667236328, + "loss": 0.5837, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.1314260959625244, + "rewards/margins": 0.3794175982475281, + "rewards/rejected": -1.5108436346054077, + "step": 13970 + }, + { + "epoch": 2.40868366643694, + "grad_norm": 30.23308753967285, + "learning_rate": 1.1376120526932987e-08, + "logits/chosen": -2.259094476699829, + "logits/rejected": -2.244642734527588, + "logps/chosen": -175.16502380371094, + "logps/rejected": -211.3958740234375, + "loss": 0.5767, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.1950197219848633, + "rewards/margins": 0.38072091341018677, + "rewards/rejected": -1.5757405757904053, + "step": 13980 + }, + { + "epoch": 2.410406616126809, + "grad_norm": 41.41474914550781, + "learning_rate": 1.1312540139691012e-08, + "logits/chosen": -2.288893222808838, + "logits/rejected": -2.2532081604003906, + "logps/chosen": -172.82003784179688, + "logps/rejected": -202.6908721923828, + "loss": 0.5953, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.1435893774032593, + "rewards/margins": 0.3480374217033386, + "rewards/rejected": -1.4916269779205322, + "step": 13990 + }, + { + "epoch": 2.412129565816678, + "grad_norm": 39.095706939697266, + "learning_rate": 1.1249115252960845e-08, + "logits/chosen": -2.2197909355163574, + "logits/rejected": -2.1884961128234863, + "logps/chosen": -172.7142333984375, + "logps/rejected": -213.0373992919922, + "loss": 0.5716, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.2112494707107544, + "rewards/margins": 0.39283236861228943, + "rewards/rejected": -1.6040818691253662, + "step": 14000 + }, + { + "epoch": 2.412129565816678, + "eval_logits/chosen": -2.3650660514831543, + "eval_logits/rejected": -2.353348970413208, + "eval_logps/chosen": -162.07864379882812, + "eval_logps/rejected": -182.53675842285156, + "eval_loss": 0.6510778069496155, + "eval_rewards/accuracies": 0.5940985083580017, + "eval_rewards/chosen": -1.0306316614151, + "eval_rewards/margins": 0.16723977029323578, + "eval_rewards/rejected": -1.197871446609497, + "eval_runtime": 384.5746, + "eval_samples_per_second": 11.192, + "eval_steps_per_second": 1.399, + "step": 14000 + }, + { + "epoch": 2.413852515506547, + "grad_norm": 31.06534767150879, + "learning_rate": 1.1185846121672677e-08, + "logits/chosen": -2.2538845539093018, + "logits/rejected": -2.2423095703125, + "logps/chosen": -177.99278259277344, + "logps/rejected": -201.70272827148438, + "loss": 0.6277, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.2160732746124268, + "rewards/margins": 0.27514421939849854, + "rewards/rejected": -1.4912176132202148, + "step": 14010 + }, + { + "epoch": 2.415575465196416, + "grad_norm": 32.03325271606445, + "learning_rate": 1.1122733000130697e-08, + "logits/chosen": -2.268951892852783, + "logits/rejected": -2.257387638092041, + "logps/chosen": -176.60130310058594, + "logps/rejected": -198.1162567138672, + "loss": 0.6351, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.2103415727615356, + "rewards/margins": 0.26508891582489014, + "rewards/rejected": -1.4754304885864258, + "step": 14020 + }, + { + "epoch": 2.4172984148862855, + "grad_norm": 37.38322448730469, + "learning_rate": 1.1059776142011995e-08, + "logits/chosen": -2.2546093463897705, + "logits/rejected": -2.2319176197052, + "logps/chosen": -181.75502014160156, + "logps/rejected": -207.01553344726562, + "loss": 0.6094, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2745532989501953, + "rewards/margins": 0.3088175356388092, + "rewards/rejected": -1.583370566368103, + "step": 14030 + }, + { + "epoch": 2.4190213645761545, + "grad_norm": 36.754966735839844, + "learning_rate": 1.0996975800365577e-08, + "logits/chosen": -2.2602710723876953, + "logits/rejected": -2.2106664180755615, + "logps/chosen": -176.01315307617188, + "logps/rejected": -209.2595672607422, + "loss": 0.5703, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.1861064434051514, + "rewards/margins": 0.4158700406551361, + "rewards/rejected": -1.6019766330718994, + "step": 14040 + }, + { + "epoch": 2.4207443142660234, + "grad_norm": 31.584936141967773, + "learning_rate": 1.0934332227611365e-08, + "logits/chosen": -2.2989883422851562, + "logits/rejected": -2.2649173736572266, + "logps/chosen": -166.5214385986328, + "logps/rejected": -189.67869567871094, + "loss": 0.6205, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1098644733428955, + "rewards/margins": 0.28757235407829285, + "rewards/rejected": -1.3974368572235107, + "step": 14050 + }, + { + "epoch": 2.4224672639558924, + "grad_norm": 26.96416664123535, + "learning_rate": 1.0871845675539166e-08, + "logits/chosen": -2.318040609359741, + "logits/rejected": -2.2748923301696777, + "logps/chosen": -171.8230438232422, + "logps/rejected": -212.2820281982422, + "loss": 0.5522, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.1478931903839111, + "rewards/margins": 0.4663833677768707, + "rewards/rejected": -1.6142765283584595, + "step": 14060 + }, + { + "epoch": 2.4241902136457614, + "grad_norm": 33.14677429199219, + "learning_rate": 1.0809516395307644e-08, + "logits/chosen": -2.2845077514648438, + "logits/rejected": -2.2546377182006836, + "logps/chosen": -177.21847534179688, + "logps/rejected": -213.17330932617188, + "loss": 0.5825, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.2126785516738892, + "rewards/margins": 0.3783145546913147, + "rewards/rejected": -1.5909929275512695, + "step": 14070 + }, + { + "epoch": 2.425913163335631, + "grad_norm": 27.923873901367188, + "learning_rate": 1.07473446374433e-08, + "logits/chosen": -2.2292556762695312, + "logits/rejected": -2.1985204219818115, + "logps/chosen": -191.96128845214844, + "logps/rejected": -218.0422821044922, + "loss": 0.6239, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.3302682638168335, + "rewards/margins": 0.32979243993759155, + "rewards/rejected": -1.6600606441497803, + "step": 14080 + }, + { + "epoch": 2.4276361130254998, + "grad_norm": 34.443580627441406, + "learning_rate": 1.0685330651839542e-08, + "logits/chosen": -2.264598846435547, + "logits/rejected": -2.2283425331115723, + "logps/chosen": -173.8609161376953, + "logps/rejected": -199.7482147216797, + "loss": 0.6073, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1643470525741577, + "rewards/margins": 0.32152336835861206, + "rewards/rejected": -1.485870361328125, + "step": 14090 + }, + { + "epoch": 2.4293590627153687, + "grad_norm": 34.7966423034668, + "learning_rate": 1.0623474687755607e-08, + "logits/chosen": -2.3133583068847656, + "logits/rejected": -2.2820286750793457, + "logps/chosen": -178.41860961914062, + "logps/rejected": -213.8348388671875, + "loss": 0.5781, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.1884939670562744, + "rewards/margins": 0.4087247848510742, + "rewards/rejected": -1.5972187519073486, + "step": 14100 + }, + { + "epoch": 2.4310820124052377, + "grad_norm": 36.908451080322266, + "learning_rate": 1.0561776993815563e-08, + "logits/chosen": -2.309270143508911, + "logits/rejected": -2.295470714569092, + "logps/chosen": -174.90457153320312, + "logps/rejected": -204.77542114257812, + "loss": 0.6114, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.2425603866577148, + "rewards/margins": 0.30024009943008423, + "rewards/rejected": -1.5428004264831543, + "step": 14110 + }, + { + "epoch": 2.4328049620951067, + "grad_norm": 47.489341735839844, + "learning_rate": 1.0500237818007318e-08, + "logits/chosen": -2.2908339500427246, + "logits/rejected": -2.26305890083313, + "logps/chosen": -173.8285369873047, + "logps/rejected": -197.50125122070312, + "loss": 0.6355, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.2086008787155151, + "rewards/margins": 0.24783003330230713, + "rewards/rejected": -1.4564310312271118, + "step": 14120 + }, + { + "epoch": 2.4345279117849756, + "grad_norm": 43.21564483642578, + "learning_rate": 1.0438857407681683e-08, + "logits/chosen": -2.3160407543182373, + "logits/rejected": -2.2969260215759277, + "logps/chosen": -175.17642211914062, + "logps/rejected": -190.04049682617188, + "loss": 0.6502, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -1.1963536739349365, + "rewards/margins": 0.1892521232366562, + "rewards/rejected": -1.3856055736541748, + "step": 14130 + }, + { + "epoch": 2.436250861474845, + "grad_norm": 27.080957412719727, + "learning_rate": 1.0377636009551271e-08, + "logits/chosen": -2.358844041824341, + "logits/rejected": -2.3329920768737793, + "logps/chosen": -178.12252807617188, + "logps/rejected": -206.2164306640625, + "loss": 0.6192, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.2507833242416382, + "rewards/margins": 0.30481910705566406, + "rewards/rejected": -1.5556023120880127, + "step": 14140 + }, + { + "epoch": 2.437973811164714, + "grad_norm": 38.455928802490234, + "learning_rate": 1.0316573869689605e-08, + "logits/chosen": -2.3427817821502686, + "logits/rejected": -2.3219618797302246, + "logps/chosen": -176.49655151367188, + "logps/rejected": -207.5787353515625, + "loss": 0.6135, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.2148641347885132, + "rewards/margins": 0.3144477903842926, + "rewards/rejected": -1.529312014579773, + "step": 14150 + }, + { + "epoch": 2.439696760854583, + "grad_norm": 38.341854095458984, + "learning_rate": 1.025567123353004e-08, + "logits/chosen": -2.289370059967041, + "logits/rejected": -2.265319347381592, + "logps/chosen": -171.89669799804688, + "logps/rejected": -205.0989532470703, + "loss": 0.6089, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.1749041080474854, + "rewards/margins": 0.3178651034832001, + "rewards/rejected": -1.4927692413330078, + "step": 14160 + }, + { + "epoch": 2.441419710544452, + "grad_norm": 36.23020553588867, + "learning_rate": 1.0194928345864867e-08, + "logits/chosen": -2.2404868602752686, + "logits/rejected": -2.211150646209717, + "logps/chosen": -172.58767700195312, + "logps/rejected": -199.41384887695312, + "loss": 0.6132, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.1612130403518677, + "rewards/margins": 0.2983928918838501, + "rewards/rejected": -1.4596058130264282, + "step": 14170 + }, + { + "epoch": 2.4431426602343214, + "grad_norm": 44.42694854736328, + "learning_rate": 1.0134345450844245e-08, + "logits/chosen": -2.301393985748291, + "logits/rejected": -2.264596462249756, + "logps/chosen": -177.45223999023438, + "logps/rejected": -199.0797882080078, + "loss": 0.6199, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.1935441493988037, + "rewards/margins": 0.28957706689834595, + "rewards/rejected": -1.4831211566925049, + "step": 14180 + }, + { + "epoch": 2.4448656099241903, + "grad_norm": 30.67015838623047, + "learning_rate": 1.0073922791975276e-08, + "logits/chosen": -2.320544719696045, + "logits/rejected": -2.30179500579834, + "logps/chosen": -181.50807189941406, + "logps/rejected": -204.28504943847656, + "loss": 0.6171, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.2201424837112427, + "rewards/margins": 0.28387171030044556, + "rewards/rejected": -1.504014253616333, + "step": 14190 + }, + { + "epoch": 2.4465885596140593, + "grad_norm": 32.7570686340332, + "learning_rate": 1.0013660612121034e-08, + "logits/chosen": -2.2290682792663574, + "logits/rejected": -2.203991651535034, + "logps/chosen": -166.692626953125, + "logps/rejected": -206.97122192382812, + "loss": 0.5601, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.1180620193481445, + "rewards/margins": 0.41545921564102173, + "rewards/rejected": -1.533521294593811, + "step": 14200 + }, + { + "epoch": 2.4483115093039283, + "grad_norm": 33.343238830566406, + "learning_rate": 9.953559153499509e-09, + "logits/chosen": -2.3083038330078125, + "logits/rejected": -2.285892963409424, + "logps/chosen": -172.07347106933594, + "logps/rejected": -201.3411102294922, + "loss": 0.6103, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.1820226907730103, + "rewards/margins": 0.31112828850746155, + "rewards/rejected": -1.4931509494781494, + "step": 14210 + }, + { + "epoch": 2.4500344589937972, + "grad_norm": 31.730648040771484, + "learning_rate": 9.893618657682712e-09, + "logits/chosen": -2.359067440032959, + "logits/rejected": -2.3279736042022705, + "logps/chosen": -176.6471710205078, + "logps/rejected": -201.74581909179688, + "loss": 0.6194, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.1735197305679321, + "rewards/margins": 0.298747718334198, + "rewards/rejected": -1.472267508506775, + "step": 14220 + }, + { + "epoch": 2.451757408683666, + "grad_norm": 34.798095703125, + "learning_rate": 9.833839365595686e-09, + "logits/chosen": -2.236711025238037, + "logits/rejected": -2.1991546154022217, + "logps/chosen": -172.93850708007812, + "logps/rejected": -207.1778564453125, + "loss": 0.5772, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.155918836593628, + "rewards/margins": 0.40659481287002563, + "rewards/rejected": -1.5625135898590088, + "step": 14230 + }, + { + "epoch": 2.4534803583735356, + "grad_norm": 35.521636962890625, + "learning_rate": 9.774221517515563e-09, + "logits/chosen": -2.2360782623291016, + "logits/rejected": -2.2201473712921143, + "logps/chosen": -174.67828369140625, + "logps/rejected": -210.30056762695312, + "loss": 0.5886, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.1995961666107178, + "rewards/margins": 0.3683069348335266, + "rewards/rejected": -1.5679031610488892, + "step": 14240 + }, + { + "epoch": 2.4552033080634046, + "grad_norm": 42.19344711303711, + "learning_rate": 9.71476535307047e-09, + "logits/chosen": -2.2473440170288086, + "logits/rejected": -2.2286272048950195, + "logps/chosen": -177.55215454101562, + "logps/rejected": -201.4174346923828, + "loss": 0.6289, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.2225806713104248, + "rewards/margins": 0.2655474543571472, + "rewards/rejected": -1.4881280660629272, + "step": 14250 + }, + { + "epoch": 2.4569262577532736, + "grad_norm": 61.28897476196289, + "learning_rate": 9.65547111123875e-09, + "logits/chosen": -2.3363399505615234, + "logits/rejected": -2.2926130294799805, + "logps/chosen": -177.8993682861328, + "logps/rejected": -199.60348510742188, + "loss": 0.6195, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.2001395225524902, + "rewards/margins": 0.3035646080970764, + "rewards/rejected": -1.503704309463501, + "step": 14260 + }, + { + "epoch": 2.4586492074431425, + "grad_norm": 23.33378791809082, + "learning_rate": 9.596339030347906e-09, + "logits/chosen": -2.302767038345337, + "logits/rejected": -2.270862340927124, + "logps/chosen": -175.49441528320312, + "logps/rejected": -213.15371704101562, + "loss": 0.5656, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.200317144393921, + "rewards/margins": 0.42555421590805054, + "rewards/rejected": -1.6258713006973267, + "step": 14270 + }, + { + "epoch": 2.460372157133012, + "grad_norm": 31.418357849121094, + "learning_rate": 9.537369348073598e-09, + "logits/chosen": -2.235307216644287, + "logits/rejected": -2.2249467372894287, + "logps/chosen": -173.64279174804688, + "logps/rejected": -194.50186157226562, + "loss": 0.6545, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2113587856292725, + "rewards/margins": 0.2167220413684845, + "rewards/rejected": -1.4280807971954346, + "step": 14280 + }, + { + "epoch": 2.462095106822881, + "grad_norm": 32.423194885253906, + "learning_rate": 9.478562301438809e-09, + "logits/chosen": -2.2253098487854004, + "logits/rejected": -2.193913459777832, + "logps/chosen": -187.88681030273438, + "logps/rejected": -204.9330291748047, + "loss": 0.6332, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.266350507736206, + "rewards/margins": 0.26731666922569275, + "rewards/rejected": -1.5336672067642212, + "step": 14290 + }, + { + "epoch": 2.46381805651275, + "grad_norm": 32.8293342590332, + "learning_rate": 9.419918126812748e-09, + "logits/chosen": -2.347121238708496, + "logits/rejected": -2.325183868408203, + "logps/chosen": -163.93734741210938, + "logps/rejected": -206.36135864257812, + "loss": 0.5616, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.0888183116912842, + "rewards/margins": 0.4189375340938568, + "rewards/rejected": -1.507755994796753, + "step": 14300 + }, + { + "epoch": 2.465541006202619, + "grad_norm": 32.74689865112305, + "learning_rate": 9.361437059910055e-09, + "logits/chosen": -2.2703864574432373, + "logits/rejected": -2.243338108062744, + "logps/chosen": -176.22911071777344, + "logps/rejected": -205.7763214111328, + "loss": 0.6056, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.216325044631958, + "rewards/margins": 0.33054304122924805, + "rewards/rejected": -1.546868085861206, + "step": 14310 + }, + { + "epoch": 2.467263955892488, + "grad_norm": 30.113887786865234, + "learning_rate": 9.303119335789705e-09, + "logits/chosen": -2.255998134613037, + "logits/rejected": -2.2358832359313965, + "logps/chosen": -167.65725708007812, + "logps/rejected": -203.99642944335938, + "loss": 0.5832, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.1159486770629883, + "rewards/margins": 0.37771984934806824, + "rewards/rejected": -1.493668556213379, + "step": 14320 + }, + { + "epoch": 2.468986905582357, + "grad_norm": 43.85342788696289, + "learning_rate": 9.244965188854186e-09, + "logits/chosen": -2.3707833290100098, + "logits/rejected": -2.330193042755127, + "logps/chosen": -178.6895294189453, + "logps/rejected": -234.42626953125, + "loss": 0.5207, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.2279475927352905, + "rewards/margins": 0.5815509557723999, + "rewards/rejected": -1.8094985485076904, + "step": 14330 + }, + { + "epoch": 2.470709855272226, + "grad_norm": 40.238525390625, + "learning_rate": 9.186974852848467e-09, + "logits/chosen": -2.2677359580993652, + "logits/rejected": -2.2470545768737793, + "logps/chosen": -178.6530303955078, + "logps/rejected": -221.50711059570312, + "loss": 0.5663, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.2290410995483398, + "rewards/margins": 0.4284849166870117, + "rewards/rejected": -1.6575257778167725, + "step": 14340 + }, + { + "epoch": 2.472432804962095, + "grad_norm": 29.78543472290039, + "learning_rate": 9.129148560859102e-09, + "logits/chosen": -2.3092570304870605, + "logits/rejected": -2.286848306655884, + "logps/chosen": -175.47021484375, + "logps/rejected": -209.6531524658203, + "loss": 0.5908, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.2112047672271729, + "rewards/margins": 0.35105112195014954, + "rewards/rejected": -1.562256097793579, + "step": 14350 + }, + { + "epoch": 2.474155754651964, + "grad_norm": 46.5627555847168, + "learning_rate": 9.0714865453133e-09, + "logits/chosen": -2.2718825340270996, + "logits/rejected": -2.2470686435699463, + "logps/chosen": -187.2011260986328, + "logps/rejected": -211.96572875976562, + "loss": 0.6279, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.307032823562622, + "rewards/margins": 0.2946831285953522, + "rewards/rejected": -1.6017158031463623, + "step": 14360 + }, + { + "epoch": 2.475878704341833, + "grad_norm": 40.91022491455078, + "learning_rate": 9.013989037977977e-09, + "logits/chosen": -2.29414701461792, + "logits/rejected": -2.2601523399353027, + "logps/chosen": -181.77938842773438, + "logps/rejected": -213.543701171875, + "loss": 0.5905, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2551586627960205, + "rewards/margins": 0.3801318109035492, + "rewards/rejected": -1.635290503501892, + "step": 14370 + }, + { + "epoch": 2.4776016540317025, + "grad_norm": 32.45248794555664, + "learning_rate": 8.956656269958812e-09, + "logits/chosen": -2.358607769012451, + "logits/rejected": -2.3335189819335938, + "logps/chosen": -181.627685546875, + "logps/rejected": -211.6080322265625, + "loss": 0.5967, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.2830419540405273, + "rewards/margins": 0.32768845558166504, + "rewards/rejected": -1.610730528831482, + "step": 14380 + }, + { + "epoch": 2.4793246037215715, + "grad_norm": 39.58088302612305, + "learning_rate": 8.899488471699312e-09, + "logits/chosen": -2.2676587104797363, + "logits/rejected": -2.2427353858947754, + "logps/chosen": -183.3169708251953, + "logps/rejected": -212.7167205810547, + "loss": 0.6191, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.300159215927124, + "rewards/margins": 0.3232465386390686, + "rewards/rejected": -1.6234058141708374, + "step": 14390 + }, + { + "epoch": 2.4810475534114405, + "grad_norm": 38.66813659667969, + "learning_rate": 8.842485872979944e-09, + "logits/chosen": -2.309758186340332, + "logits/rejected": -2.2829182147979736, + "logps/chosen": -185.3292999267578, + "logps/rejected": -210.6401824951172, + "loss": 0.6078, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.2728933095932007, + "rewards/margins": 0.3427709639072418, + "rewards/rejected": -1.6156641244888306, + "step": 14400 + }, + { + "epoch": 2.4810475534114405, + "eval_logits/chosen": -2.3539810180664062, + "eval_logits/rejected": -2.3417115211486816, + "eval_logps/chosen": -167.90591430664062, + "eval_logps/rejected": -189.16842651367188, + "eval_loss": 0.6505683064460754, + "eval_rewards/accuracies": 0.6003717184066772, + "eval_rewards/chosen": -1.0889043807983398, + "eval_rewards/margins": 0.17528373003005981, + "eval_rewards/rejected": -1.2641881704330444, + "eval_runtime": 384.7222, + "eval_samples_per_second": 11.187, + "eval_steps_per_second": 1.398, + "step": 14400 + }, + { + "epoch": 2.4827705031013094, + "grad_norm": 30.45815086364746, + "learning_rate": 8.785648702917164e-09, + "logits/chosen": -2.274064064025879, + "logits/rejected": -2.252723217010498, + "logps/chosen": -175.93069458007812, + "logps/rejected": -208.1988983154297, + "loss": 0.6014, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2411539554595947, + "rewards/margins": 0.3212995231151581, + "rewards/rejected": -1.5624535083770752, + "step": 14410 + }, + { + "epoch": 2.4844934527911784, + "grad_norm": 33.634010314941406, + "learning_rate": 8.728977189962484e-09, + "logits/chosen": -2.2990376949310303, + "logits/rejected": -2.279719591140747, + "logps/chosen": -187.06346130371094, + "logps/rejected": -211.91421508789062, + "loss": 0.6467, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.3428099155426025, + "rewards/margins": 0.26273879408836365, + "rewards/rejected": -1.6055485010147095, + "step": 14420 + }, + { + "epoch": 2.4862164024810474, + "grad_norm": 38.20600891113281, + "learning_rate": 8.672471561901563e-09, + "logits/chosen": -2.276118278503418, + "logits/rejected": -2.249997615814209, + "logps/chosen": -179.34445190429688, + "logps/rejected": -213.3744659423828, + "loss": 0.5951, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2632334232330322, + "rewards/margins": 0.37510672211647034, + "rewards/rejected": -1.6383403539657593, + "step": 14430 + }, + { + "epoch": 2.4879393521709168, + "grad_norm": 30.55494499206543, + "learning_rate": 8.616132045853341e-09, + "logits/chosen": -2.274087429046631, + "logits/rejected": -2.2450904846191406, + "logps/chosen": -172.67007446289062, + "logps/rejected": -214.5222930908203, + "loss": 0.5597, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.1785537004470825, + "rewards/margins": 0.43323737382888794, + "rewards/rejected": -1.6117912530899048, + "step": 14440 + }, + { + "epoch": 2.4896623018607857, + "grad_norm": 44.838504791259766, + "learning_rate": 8.559958868269058e-09, + "logits/chosen": -2.2660040855407715, + "logits/rejected": -2.2459208965301514, + "logps/chosen": -186.3092803955078, + "logps/rejected": -211.8496856689453, + "loss": 0.6032, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.3006166219711304, + "rewards/margins": 0.3049980401992798, + "rewards/rejected": -1.6056146621704102, + "step": 14450 + }, + { + "epoch": 2.4913852515506547, + "grad_norm": 27.789527893066406, + "learning_rate": 8.50395225493138e-09, + "logits/chosen": -2.2741141319274902, + "logits/rejected": -2.2550957202911377, + "logps/chosen": -183.6599578857422, + "logps/rejected": -211.07876586914062, + "loss": 0.6189, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.291940689086914, + "rewards/margins": 0.29155492782592773, + "rewards/rejected": -1.5834957361221313, + "step": 14460 + }, + { + "epoch": 2.4931082012405237, + "grad_norm": 39.434593200683594, + "learning_rate": 8.448112430953502e-09, + "logits/chosen": -2.379971981048584, + "logits/rejected": -2.3414828777313232, + "logps/chosen": -183.31814575195312, + "logps/rejected": -212.9709930419922, + "loss": 0.5712, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.234494686126709, + "rewards/margins": 0.4042988419532776, + "rewards/rejected": -1.6387935876846313, + "step": 14470 + }, + { + "epoch": 2.4948311509303926, + "grad_norm": 33.748207092285156, + "learning_rate": 8.392439620778197e-09, + "logits/chosen": -2.288809299468994, + "logits/rejected": -2.280153274536133, + "logps/chosen": -183.03431701660156, + "logps/rejected": -219.2383575439453, + "loss": 0.5833, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.2870408296585083, + "rewards/margins": 0.3628956973552704, + "rewards/rejected": -1.6499366760253906, + "step": 14480 + }, + { + "epoch": 2.496554100620262, + "grad_norm": 28.285213470458984, + "learning_rate": 8.336934048176935e-09, + "logits/chosen": -2.2701704502105713, + "logits/rejected": -2.2498607635498047, + "logps/chosen": -178.55081176757812, + "logps/rejected": -212.88577270507812, + "loss": 0.601, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.271669626235962, + "rewards/margins": 0.3485538363456726, + "rewards/rejected": -1.6202234029769897, + "step": 14490 + }, + { + "epoch": 2.498277050310131, + "grad_norm": 31.462501525878906, + "learning_rate": 8.281595936249031e-09, + "logits/chosen": -2.2780776023864746, + "logits/rejected": -2.239255905151367, + "logps/chosen": -178.77101135253906, + "logps/rejected": -212.22274780273438, + "loss": 0.5806, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.2120325565338135, + "rewards/margins": 0.37745457887649536, + "rewards/rejected": -1.5894873142242432, + "step": 14500 + }, + { + "epoch": 2.5, + "grad_norm": 35.11719512939453, + "learning_rate": 8.226425507420687e-09, + "logits/chosen": -2.3147332668304443, + "logits/rejected": -2.2924513816833496, + "logps/chosen": -186.06500244140625, + "logps/rejected": -206.14431762695312, + "loss": 0.6319, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.296118140220642, + "rewards/margins": 0.25911945104599, + "rewards/rejected": -1.5552375316619873, + "step": 14510 + }, + { + "epoch": 2.501722949689869, + "grad_norm": 35.80636978149414, + "learning_rate": 8.171422983444116e-09, + "logits/chosen": -2.277247190475464, + "logits/rejected": -2.2509655952453613, + "logps/chosen": -183.95880126953125, + "logps/rejected": -209.32876586914062, + "loss": 0.6284, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.271933674812317, + "rewards/margins": 0.28048646450042725, + "rewards/rejected": -1.552419900894165, + "step": 14520 + }, + { + "epoch": 2.503445899379738, + "grad_norm": 40.243621826171875, + "learning_rate": 8.11658858539664e-09, + "logits/chosen": -2.3021738529205322, + "logits/rejected": -2.2769148349761963, + "logps/chosen": -186.34942626953125, + "logps/rejected": -218.3411102294922, + "loss": 0.5939, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.2804752588272095, + "rewards/margins": 0.3546562194824219, + "rewards/rejected": -1.6351312398910522, + "step": 14530 + }, + { + "epoch": 2.505168849069607, + "grad_norm": 25.926076889038086, + "learning_rate": 8.061922533679838e-09, + "logits/chosen": -2.270775556564331, + "logits/rejected": -2.243312358856201, + "logps/chosen": -180.53915405273438, + "logps/rejected": -215.7350311279297, + "loss": 0.586, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.251887559890747, + "rewards/margins": 0.39002978801727295, + "rewards/rejected": -1.6419174671173096, + "step": 14540 + }, + { + "epoch": 2.5068917987594763, + "grad_norm": 29.957273483276367, + "learning_rate": 8.007425048018652e-09, + "logits/chosen": -2.3033363819122314, + "logits/rejected": -2.265852689743042, + "logps/chosen": -180.44415283203125, + "logps/rejected": -211.4573211669922, + "loss": 0.5961, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.2219346761703491, + "rewards/margins": 0.3654627501964569, + "rewards/rejected": -1.5873974561691284, + "step": 14550 + }, + { + "epoch": 2.5086147484493453, + "grad_norm": 29.57919692993164, + "learning_rate": 7.953096347460442e-09, + "logits/chosen": -2.264911651611328, + "logits/rejected": -2.2368927001953125, + "logps/chosen": -177.29637145996094, + "logps/rejected": -216.89492797851562, + "loss": 0.5777, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.2392809391021729, + "rewards/margins": 0.3909870386123657, + "rewards/rejected": -1.630267858505249, + "step": 14560 + }, + { + "epoch": 2.5103376981392143, + "grad_norm": 40.99264907836914, + "learning_rate": 7.898936650374177e-09, + "logits/chosen": -2.2209935188293457, + "logits/rejected": -2.211142063140869, + "logps/chosen": -179.51312255859375, + "logps/rejected": -211.30606079101562, + "loss": 0.6259, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.2609214782714844, + "rewards/margins": 0.3168264925479889, + "rewards/rejected": -1.5777479410171509, + "step": 14570 + }, + { + "epoch": 2.5120606478290832, + "grad_norm": 36.08527374267578, + "learning_rate": 7.844946174449552e-09, + "logits/chosen": -2.3129754066467285, + "logits/rejected": -2.2868075370788574, + "logps/chosen": -171.59011840820312, + "logps/rejected": -204.12374877929688, + "loss": 0.6058, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.209629774093628, + "rewards/margins": 0.3304891884326935, + "rewards/rejected": -1.5401188135147095, + "step": 14580 + }, + { + "epoch": 2.5137835975189526, + "grad_norm": 44.74150085449219, + "learning_rate": 7.791125136696053e-09, + "logits/chosen": -2.2585346698760986, + "logits/rejected": -2.2415921688079834, + "logps/chosen": -172.50912475585938, + "logps/rejected": -202.05429077148438, + "loss": 0.6111, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.1867698431015015, + "rewards/margins": 0.31644508242607117, + "rewards/rejected": -1.5032150745391846, + "step": 14590 + }, + { + "epoch": 2.5155065472088216, + "grad_norm": 32.48900604248047, + "learning_rate": 7.737473753442175e-09, + "logits/chosen": -2.277561664581299, + "logits/rejected": -2.23915433883667, + "logps/chosen": -178.53311157226562, + "logps/rejected": -214.3824005126953, + "loss": 0.5669, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.2278555631637573, + "rewards/margins": 0.4098914563655853, + "rewards/rejected": -1.6377471685409546, + "step": 14600 + }, + { + "epoch": 2.5172294968986906, + "grad_norm": 58.75372314453125, + "learning_rate": 7.683992240334442e-09, + "logits/chosen": -2.257072687149048, + "logits/rejected": -2.2232871055603027, + "logps/chosen": -183.6359100341797, + "logps/rejected": -207.38363647460938, + "loss": 0.6014, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.2601312398910522, + "rewards/margins": 0.3229925036430359, + "rewards/rejected": -1.5831239223480225, + "step": 14610 + }, + { + "epoch": 2.5189524465885595, + "grad_norm": 32.79248809814453, + "learning_rate": 7.630680812336666e-09, + "logits/chosen": -2.2399673461914062, + "logits/rejected": -2.2240517139434814, + "logps/chosen": -181.73648071289062, + "logps/rejected": -216.5868682861328, + "loss": 0.6075, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.2946263551712036, + "rewards/margins": 0.34417837858200073, + "rewards/rejected": -1.6388046741485596, + "step": 14620 + }, + { + "epoch": 2.5206753962784285, + "grad_norm": 35.11437225341797, + "learning_rate": 7.577539683728963e-09, + "logits/chosen": -2.203411817550659, + "logits/rejected": -2.1906564235687256, + "logps/chosen": -186.56008911132812, + "logps/rejected": -211.3187713623047, + "loss": 0.6224, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.3181005716323853, + "rewards/margins": 0.27355271577835083, + "rewards/rejected": -1.5916532278060913, + "step": 14630 + }, + { + "epoch": 2.5223983459682975, + "grad_norm": 27.418834686279297, + "learning_rate": 7.524569068106984e-09, + "logits/chosen": -2.2308120727539062, + "logits/rejected": -2.213721513748169, + "logps/chosen": -178.981201171875, + "logps/rejected": -213.73458862304688, + "loss": 0.5998, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.2649548053741455, + "rewards/margins": 0.3553503453731537, + "rewards/rejected": -1.620305061340332, + "step": 14640 + }, + { + "epoch": 2.524121295658167, + "grad_norm": 60.733726501464844, + "learning_rate": 7.471769178381032e-09, + "logits/chosen": -2.2828164100646973, + "logits/rejected": -2.261434316635132, + "logps/chosen": -192.31106567382812, + "logps/rejected": -207.40194702148438, + "loss": 0.6721, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.3739750385284424, + "rewards/margins": 0.20933596789836884, + "rewards/rejected": -1.583310842514038, + "step": 14650 + }, + { + "epoch": 2.525844245348036, + "grad_norm": 32.428741455078125, + "learning_rate": 7.419140226775117e-09, + "logits/chosen": -2.2503812313079834, + "logits/rejected": -2.2110300064086914, + "logps/chosen": -178.69284057617188, + "logps/rejected": -217.0933380126953, + "loss": 0.5539, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.2397966384887695, + "rewards/margins": 0.4224214553833008, + "rewards/rejected": -1.6622183322906494, + "step": 14660 + }, + { + "epoch": 2.527567195037905, + "grad_norm": 27.9921932220459, + "learning_rate": 7.366682424826259e-09, + "logits/chosen": -2.202364683151245, + "logits/rejected": -2.177262783050537, + "logps/chosen": -171.57933044433594, + "logps/rejected": -207.56698608398438, + "loss": 0.5828, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.1934975385665894, + "rewards/margins": 0.36846160888671875, + "rewards/rejected": -1.561959147453308, + "step": 14670 + }, + { + "epoch": 2.529290144727774, + "grad_norm": 43.02341079711914, + "learning_rate": 7.314395983383548e-09, + "logits/chosen": -2.262956142425537, + "logits/rejected": -2.241204261779785, + "logps/chosen": -173.86380004882812, + "logps/rejected": -207.066162109375, + "loss": 0.6014, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.2458035945892334, + "rewards/margins": 0.30617329478263855, + "rewards/rejected": -1.5519767999649048, + "step": 14680 + }, + { + "epoch": 2.531013094417643, + "grad_norm": 40.94485092163086, + "learning_rate": 7.262281112607266e-09, + "logits/chosen": -2.2700066566467285, + "logits/rejected": -2.2460155487060547, + "logps/chosen": -189.86940002441406, + "logps/rejected": -227.3620147705078, + "loss": 0.5994, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3717550039291382, + "rewards/margins": 0.37433767318725586, + "rewards/rejected": -1.7460925579071045, + "step": 14690 + }, + { + "epoch": 2.532736044107512, + "grad_norm": 28.846698760986328, + "learning_rate": 7.210338021968099e-09, + "logits/chosen": -2.333097219467163, + "logits/rejected": -2.307025194168091, + "logps/chosen": -189.25680541992188, + "logps/rejected": -235.3755340576172, + "loss": 0.572, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.3376073837280273, + "rewards/margins": 0.45716962218284607, + "rewards/rejected": -1.7947769165039062, + "step": 14700 + }, + { + "epoch": 2.534458993797381, + "grad_norm": 39.72157287597656, + "learning_rate": 7.158566920246306e-09, + "logits/chosen": -2.279195547103882, + "logits/rejected": -2.2656617164611816, + "logps/chosen": -181.99972534179688, + "logps/rejected": -213.56253051757812, + "loss": 0.6022, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.274493932723999, + "rewards/margins": 0.33153611421585083, + "rewards/rejected": -1.6060301065444946, + "step": 14710 + }, + { + "epoch": 2.53618194348725, + "grad_norm": 40.20477294921875, + "learning_rate": 7.1069680155308455e-09, + "logits/chosen": -2.247303009033203, + "logits/rejected": -2.2131989002227783, + "logps/chosen": -189.83795166015625, + "logps/rejected": -223.70590209960938, + "loss": 0.5967, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.3142995834350586, + "rewards/margins": 0.3924974799156189, + "rewards/rejected": -1.7067972421646118, + "step": 14720 + }, + { + "epoch": 2.537904893177119, + "grad_norm": 42.69450759887695, + "learning_rate": 7.055541515218505e-09, + "logits/chosen": -2.315809965133667, + "logits/rejected": -2.2835214138031006, + "logps/chosen": -178.3886260986328, + "logps/rejected": -212.3560333251953, + "loss": 0.5927, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.239457368850708, + "rewards/margins": 0.38728493452072144, + "rewards/rejected": -1.6267423629760742, + "step": 14730 + }, + { + "epoch": 2.539627842866988, + "grad_norm": 36.893882751464844, + "learning_rate": 7.004287626013167e-09, + "logits/chosen": -2.2423672676086426, + "logits/rejected": -2.233050584793091, + "logps/chosen": -194.85977172851562, + "logps/rejected": -223.08267211914062, + "loss": 0.63, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.3785429000854492, + "rewards/margins": 0.3103850781917572, + "rewards/rejected": -1.6889280080795288, + "step": 14740 + }, + { + "epoch": 2.5413507925568575, + "grad_norm": 32.512611389160156, + "learning_rate": 6.9532065539248785e-09, + "logits/chosen": -2.2675812244415283, + "logits/rejected": -2.235931873321533, + "logps/chosen": -180.07955932617188, + "logps/rejected": -211.1636962890625, + "loss": 0.5938, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.2650864124298096, + "rewards/margins": 0.3591112792491913, + "rewards/rejected": -1.6241976022720337, + "step": 14750 + }, + { + "epoch": 2.5430737422467264, + "grad_norm": 42.58386993408203, + "learning_rate": 6.902298504269089e-09, + "logits/chosen": -2.30161714553833, + "logits/rejected": -2.2694239616394043, + "logps/chosen": -177.73812866210938, + "logps/rejected": -214.989013671875, + "loss": 0.5785, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.2390029430389404, + "rewards/margins": 0.3862294852733612, + "rewards/rejected": -1.625232458114624, + "step": 14760 + }, + { + "epoch": 2.5447966919365954, + "grad_norm": 40.47549057006836, + "learning_rate": 6.851563681665778e-09, + "logits/chosen": -2.292722225189209, + "logits/rejected": -2.2698347568511963, + "logps/chosen": -186.69671630859375, + "logps/rejected": -221.0588836669922, + "loss": 0.606, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.3003615140914917, + "rewards/margins": 0.3530925512313843, + "rewards/rejected": -1.6534541845321655, + "step": 14770 + }, + { + "epoch": 2.5465196416264644, + "grad_norm": 34.78125, + "learning_rate": 6.801002290038687e-09, + "logits/chosen": -2.260918140411377, + "logits/rejected": -2.2494137287139893, + "logps/chosen": -177.1928253173828, + "logps/rejected": -208.9444122314453, + "loss": 0.6072, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.266132116317749, + "rewards/margins": 0.3350914418697357, + "rewards/rejected": -1.6012235879898071, + "step": 14780 + }, + { + "epoch": 2.548242591316334, + "grad_norm": 28.724706649780273, + "learning_rate": 6.750614532614446e-09, + "logits/chosen": -2.3022968769073486, + "logits/rejected": -2.272913694381714, + "logps/chosen": -194.76040649414062, + "logps/rejected": -222.46749877929688, + "loss": 0.6229, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.4062250852584839, + "rewards/margins": 0.3128969073295593, + "rewards/rejected": -1.7191219329833984, + "step": 14790 + }, + { + "epoch": 2.5499655410062028, + "grad_norm": 46.327484130859375, + "learning_rate": 6.7004006119217695e-09, + "logits/chosen": -2.2679340839385986, + "logits/rejected": -2.2513203620910645, + "logps/chosen": -189.68954467773438, + "logps/rejected": -225.81494140625, + "loss": 0.6112, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3462212085723877, + "rewards/margins": 0.3592769503593445, + "rewards/rejected": -1.7054980993270874, + "step": 14800 + }, + { + "epoch": 2.5499655410062028, + "eval_logits/chosen": -2.3514130115509033, + "eval_logits/rejected": -2.3389508724212646, + "eval_logps/chosen": -169.68975830078125, + "eval_logps/rejected": -191.403564453125, + "eval_loss": 0.650035560131073, + "eval_rewards/accuracies": 0.5971189737319946, + "eval_rewards/chosen": -1.1067428588867188, + "eval_rewards/margins": 0.17979657649993896, + "eval_rewards/rejected": -1.2865396738052368, + "eval_runtime": 384.8181, + "eval_samples_per_second": 11.185, + "eval_steps_per_second": 1.398, + "step": 14800 + }, + { + "epoch": 2.5516884906960717, + "grad_norm": 36.57168197631836, + "learning_rate": 6.650360729790677e-09, + "logits/chosen": -2.305518627166748, + "logits/rejected": -2.2647337913513184, + "logps/chosen": -195.060791015625, + "logps/rejected": -216.7763671875, + "loss": 0.6095, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.3535444736480713, + "rewards/margins": 0.3394257128238678, + "rewards/rejected": -1.6929700374603271, + "step": 14810 + }, + { + "epoch": 2.5534114403859407, + "grad_norm": 35.029850006103516, + "learning_rate": 6.600495087351654e-09, + "logits/chosen": -2.399819850921631, + "logits/rejected": -2.366640567779541, + "logps/chosen": -180.87078857421875, + "logps/rejected": -218.5558624267578, + "loss": 0.565, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2629244327545166, + "rewards/margins": 0.41114553809165955, + "rewards/rejected": -1.6740700006484985, + "step": 14820 + }, + { + "epoch": 2.5551343900758097, + "grad_norm": 47.66520309448242, + "learning_rate": 6.550803885034833e-09, + "logits/chosen": -2.293966770172119, + "logits/rejected": -2.2669548988342285, + "logps/chosen": -184.09866333007812, + "logps/rejected": -209.52108764648438, + "loss": 0.6163, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2763346433639526, + "rewards/margins": 0.3137095868587494, + "rewards/rejected": -1.5900442600250244, + "step": 14830 + }, + { + "epoch": 2.5568573397656786, + "grad_norm": 27.413667678833008, + "learning_rate": 6.5012873225691875e-09, + "logits/chosen": -2.335608959197998, + "logits/rejected": -2.300196886062622, + "logps/chosen": -185.65924072265625, + "logps/rejected": -224.2730255126953, + "loss": 0.5868, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.287444829940796, + "rewards/margins": 0.3980006277561188, + "rewards/rejected": -1.6854454278945923, + "step": 14840 + }, + { + "epoch": 2.558580289455548, + "grad_norm": 34.01144027709961, + "learning_rate": 6.451945598981784e-09, + "logits/chosen": -2.2700634002685547, + "logits/rejected": -2.2440428733825684, + "logps/chosen": -191.76467895507812, + "logps/rejected": -222.75576782226562, + "loss": 0.6082, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.36762273311615, + "rewards/margins": 0.32878175377845764, + "rewards/rejected": -1.6964046955108643, + "step": 14850 + }, + { + "epoch": 2.560303239145417, + "grad_norm": 36.0723991394043, + "learning_rate": 6.4027789125969286e-09, + "logits/chosen": -2.251737117767334, + "logits/rejected": -2.2386040687561035, + "logps/chosen": -178.60855102539062, + "logps/rejected": -213.0312042236328, + "loss": 0.5966, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.2298336029052734, + "rewards/margins": 0.3910522758960724, + "rewards/rejected": -1.6208856105804443, + "step": 14860 + }, + { + "epoch": 2.562026188835286, + "grad_norm": 35.34622573852539, + "learning_rate": 6.353787461035354e-09, + "logits/chosen": -2.3134000301361084, + "logits/rejected": -2.2874956130981445, + "logps/chosen": -185.47122192382812, + "logps/rejected": -213.2705841064453, + "loss": 0.6239, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.2700347900390625, + "rewards/margins": 0.3386613726615906, + "rewards/rejected": -1.6086963415145874, + "step": 14870 + }, + { + "epoch": 2.563749138525155, + "grad_norm": 30.014429092407227, + "learning_rate": 6.304971441213469e-09, + "logits/chosen": -2.2496464252471924, + "logits/rejected": -2.2386934757232666, + "logps/chosen": -175.8731231689453, + "logps/rejected": -212.3821563720703, + "loss": 0.5841, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.218999981880188, + "rewards/margins": 0.3805854916572571, + "rewards/rejected": -1.5995855331420898, + "step": 14880 + }, + { + "epoch": 2.5654720882150244, + "grad_norm": 31.44453239440918, + "learning_rate": 6.256331049342572e-09, + "logits/chosen": -2.2204737663269043, + "logits/rejected": -2.2018988132476807, + "logps/chosen": -182.56192016601562, + "logps/rejected": -213.97006225585938, + "loss": 0.6012, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2417454719543457, + "rewards/margins": 0.33237189054489136, + "rewards/rejected": -1.5741174221038818, + "step": 14890 + }, + { + "epoch": 2.5671950379048933, + "grad_norm": 34.05234146118164, + "learning_rate": 6.207866480928003e-09, + "logits/chosen": -2.202576160430908, + "logits/rejected": -2.170523166656494, + "logps/chosen": -176.09579467773438, + "logps/rejected": -209.90576171875, + "loss": 0.595, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.2167494297027588, + "rewards/margins": 0.3754423260688782, + "rewards/rejected": -1.5921916961669922, + "step": 14900 + }, + { + "epoch": 2.5689179875947623, + "grad_norm": 36.98624801635742, + "learning_rate": 6.1595779307684334e-09, + "logits/chosen": -2.273719310760498, + "logits/rejected": -2.2517528533935547, + "logps/chosen": -168.31080627441406, + "logps/rejected": -206.63308715820312, + "loss": 0.5902, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.1679550409317017, + "rewards/margins": 0.3669065833091736, + "rewards/rejected": -1.53486168384552, + "step": 14910 + }, + { + "epoch": 2.5706409372846313, + "grad_norm": 31.453224182128906, + "learning_rate": 6.11146559295504e-09, + "logits/chosen": -2.259575366973877, + "logits/rejected": -2.2475197315216064, + "logps/chosen": -177.86105346679688, + "logps/rejected": -215.9774169921875, + "loss": 0.6009, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.240506887435913, + "rewards/margins": 0.37624454498291016, + "rewards/rejected": -1.6167514324188232, + "step": 14920 + }, + { + "epoch": 2.5723638869745002, + "grad_norm": 49.65825271606445, + "learning_rate": 6.063529660870709e-09, + "logits/chosen": -2.3555989265441895, + "logits/rejected": -2.328648328781128, + "logps/chosen": -171.65200805664062, + "logps/rejected": -212.0304412841797, + "loss": 0.5651, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.1696637868881226, + "rewards/margins": 0.4292203485965729, + "rewards/rejected": -1.5988839864730835, + "step": 14930 + }, + { + "epoch": 2.574086836664369, + "grad_norm": 67.15663146972656, + "learning_rate": 6.015770327189285e-09, + "logits/chosen": -2.2821450233459473, + "logits/rejected": -2.25254487991333, + "logps/chosen": -175.12237548828125, + "logps/rejected": -205.60256958007812, + "loss": 0.5861, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.2120064496994019, + "rewards/margins": 0.35235559940338135, + "rewards/rejected": -1.5643621683120728, + "step": 14940 + }, + { + "epoch": 2.575809786354238, + "grad_norm": 57.31182098388672, + "learning_rate": 5.968187783874806e-09, + "logits/chosen": -2.35475492477417, + "logits/rejected": -2.3296236991882324, + "logps/chosen": -181.62442016601562, + "logps/rejected": -206.0255889892578, + "loss": 0.6176, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.2485582828521729, + "rewards/margins": 0.3184163570404053, + "rewards/rejected": -1.566974401473999, + "step": 14950 + }, + { + "epoch": 2.5775327360441076, + "grad_norm": 38.26123046875, + "learning_rate": 5.920782222180748e-09, + "logits/chosen": -2.2558817863464355, + "logits/rejected": -2.2312397956848145, + "logps/chosen": -184.12100219726562, + "logps/rejected": -211.8404083251953, + "loss": 0.6204, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.2677650451660156, + "rewards/margins": 0.3114776611328125, + "rewards/rejected": -1.5792428255081177, + "step": 14960 + }, + { + "epoch": 2.5792556857339766, + "grad_norm": 48.023014068603516, + "learning_rate": 5.873553832649137e-09, + "logits/chosen": -2.3023171424865723, + "logits/rejected": -2.2697367668151855, + "logps/chosen": -179.10989379882812, + "logps/rejected": -214.2508087158203, + "loss": 0.5974, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.2418620586395264, + "rewards/margins": 0.3722601532936096, + "rewards/rejected": -1.6141221523284912, + "step": 14970 + }, + { + "epoch": 2.5809786354238455, + "grad_norm": 44.4215202331543, + "learning_rate": 5.826502805109956e-09, + "logits/chosen": -2.323770046234131, + "logits/rejected": -2.2797865867614746, + "logps/chosen": -179.7855987548828, + "logps/rejected": -224.40115356445312, + "loss": 0.5421, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.2483810186386108, + "rewards/margins": 0.4901328682899475, + "rewards/rejected": -1.7385139465332031, + "step": 14980 + }, + { + "epoch": 2.582701585113715, + "grad_norm": 30.38766098022461, + "learning_rate": 5.779629328680275e-09, + "logits/chosen": -2.3169326782226562, + "logits/rejected": -2.3045449256896973, + "logps/chosen": -172.8456573486328, + "logps/rejected": -216.1270294189453, + "loss": 0.568, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.205559492111206, + "rewards/margins": 0.4353708326816559, + "rewards/rejected": -1.640929937362671, + "step": 14990 + }, + { + "epoch": 2.584424534803584, + "grad_norm": 30.998842239379883, + "learning_rate": 5.732933591763495e-09, + "logits/chosen": -2.319552183151245, + "logits/rejected": -2.304553985595703, + "logps/chosen": -180.916259765625, + "logps/rejected": -208.8544158935547, + "loss": 0.607, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.2423069477081299, + "rewards/margins": 0.3127003312110901, + "rewards/rejected": -1.5550072193145752, + "step": 15000 + }, + { + "epoch": 2.586147484493453, + "grad_norm": 41.17787551879883, + "learning_rate": 5.686415782048643e-09, + "logits/chosen": -2.3234753608703613, + "logits/rejected": -2.2967770099639893, + "logps/chosen": -178.34402465820312, + "logps/rejected": -210.7972412109375, + "loss": 0.6112, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.225295066833496, + "rewards/margins": 0.3530086874961853, + "rewards/rejected": -1.5783039331436157, + "step": 15010 + }, + { + "epoch": 2.587870434183322, + "grad_norm": 52.57218933105469, + "learning_rate": 5.640076086509538e-09, + "logits/chosen": -2.246558666229248, + "logits/rejected": -2.2387757301330566, + "logps/chosen": -174.56594848632812, + "logps/rejected": -207.66690063476562, + "loss": 0.6215, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.2340904474258423, + "rewards/margins": 0.32109197974205017, + "rewards/rejected": -1.5551823377609253, + "step": 15020 + }, + { + "epoch": 2.589593383873191, + "grad_norm": 35.95048522949219, + "learning_rate": 5.593914691404145e-09, + "logits/chosen": -2.261404275894165, + "logits/rejected": -2.235604763031006, + "logps/chosen": -180.11178588867188, + "logps/rejected": -212.8577117919922, + "loss": 0.6168, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.2472907304763794, + "rewards/margins": 0.3403995633125305, + "rewards/rejected": -1.5876904726028442, + "step": 15030 + }, + { + "epoch": 2.59131633356306, + "grad_norm": 48.30842208862305, + "learning_rate": 5.547931782273718e-09, + "logits/chosen": -2.2906880378723145, + "logits/rejected": -2.2671432495117188, + "logps/chosen": -187.49795532226562, + "logps/rejected": -211.20700073242188, + "loss": 0.6294, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.322435975074768, + "rewards/margins": 0.27619048953056335, + "rewards/rejected": -1.5986262559890747, + "step": 15040 + }, + { + "epoch": 2.5930392832529288, + "grad_norm": 47.93348693847656, + "learning_rate": 5.5021275439421365e-09, + "logits/chosen": -2.3172030448913574, + "logits/rejected": -2.278430461883545, + "logps/chosen": -175.37673950195312, + "logps/rejected": -205.2795867919922, + "loss": 0.5773, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.1954820156097412, + "rewards/margins": 0.37430500984191895, + "rewards/rejected": -1.5697870254516602, + "step": 15050 + }, + { + "epoch": 2.594762232942798, + "grad_norm": 44.634864807128906, + "learning_rate": 5.456502160515097e-09, + "logits/chosen": -2.2572150230407715, + "logits/rejected": -2.2382962703704834, + "logps/chosen": -173.3562774658203, + "logps/rejected": -206.46798706054688, + "loss": 0.6115, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.1877930164337158, + "rewards/margins": 0.33759254217147827, + "rewards/rejected": -1.5253856182098389, + "step": 15060 + }, + { + "epoch": 2.596485182632667, + "grad_norm": 33.96257019042969, + "learning_rate": 5.411055815379451e-09, + "logits/chosen": -2.327104091644287, + "logits/rejected": -2.28928804397583, + "logps/chosen": -177.70382690429688, + "logps/rejected": -201.44186401367188, + "loss": 0.6114, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.1931462287902832, + "rewards/margins": 0.3145656883716583, + "rewards/rejected": -1.5077118873596191, + "step": 15070 + }, + { + "epoch": 2.598208132322536, + "grad_norm": 40.272865295410156, + "learning_rate": 5.365788691202372e-09, + "logits/chosen": -2.2955873012542725, + "logits/rejected": -2.2718987464904785, + "logps/chosen": -174.05657958984375, + "logps/rejected": -207.31423950195312, + "loss": 0.5995, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.1952232122421265, + "rewards/margins": 0.3444897532463074, + "rewards/rejected": -1.5397131443023682, + "step": 15080 + }, + { + "epoch": 2.599931082012405, + "grad_norm": 29.57343101501465, + "learning_rate": 5.320700969930708e-09, + "logits/chosen": -2.3112263679504395, + "logits/rejected": -2.2815232276916504, + "logps/chosen": -172.55918884277344, + "logps/rejected": -201.70616149902344, + "loss": 0.606, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.1735479831695557, + "rewards/margins": 0.3192867338657379, + "rewards/rejected": -1.4928348064422607, + "step": 15090 + }, + { + "epoch": 2.6016540317022745, + "grad_norm": 46.08125686645508, + "learning_rate": 5.2757928327902324e-09, + "logits/chosen": -2.256523609161377, + "logits/rejected": -2.2312490940093994, + "logps/chosen": -169.47898864746094, + "logps/rejected": -201.4627685546875, + "loss": 0.5938, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.1771475076675415, + "rewards/margins": 0.33779600262641907, + "rewards/rejected": -1.5149434804916382, + "step": 15100 + }, + { + "epoch": 2.6033769813921435, + "grad_norm": 37.76089859008789, + "learning_rate": 5.231064460284818e-09, + "logits/chosen": -2.286766767501831, + "logits/rejected": -2.267291307449341, + "logps/chosen": -179.2318572998047, + "logps/rejected": -202.9172821044922, + "loss": 0.6217, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.2481911182403564, + "rewards/margins": 0.2732866406440735, + "rewards/rejected": -1.5214776992797852, + "step": 15110 + }, + { + "epoch": 2.6050999310820124, + "grad_norm": 34.025146484375, + "learning_rate": 5.1865160321958646e-09, + "logits/chosen": -2.2730836868286133, + "logits/rejected": -2.2554898262023926, + "logps/chosen": -189.20458984375, + "logps/rejected": -215.56680297851562, + "loss": 0.623, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.3072446584701538, + "rewards/margins": 0.2902025282382965, + "rewards/rejected": -1.597447395324707, + "step": 15120 + }, + { + "epoch": 2.6068228807718814, + "grad_norm": 41.7362060546875, + "learning_rate": 5.142147727581498e-09, + "logits/chosen": -2.257009983062744, + "logits/rejected": -2.2283775806427, + "logps/chosen": -172.26864624023438, + "logps/rejected": -204.14013671875, + "loss": 0.5864, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.178093671798706, + "rewards/margins": 0.3746481239795685, + "rewards/rejected": -1.5527417659759521, + "step": 15130 + }, + { + "epoch": 2.6085458304617504, + "grad_norm": 36.35040283203125, + "learning_rate": 5.097959724775819e-09, + "logits/chosen": -2.2880465984344482, + "logits/rejected": -2.2623438835144043, + "logps/chosen": -175.53457641601562, + "logps/rejected": -214.331787109375, + "loss": 0.5709, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.1967875957489014, + "rewards/margins": 0.38674241304397583, + "rewards/rejected": -1.5835299491882324, + "step": 15140 + }, + { + "epoch": 2.6102687801516193, + "grad_norm": 32.46341323852539, + "learning_rate": 5.053952201388234e-09, + "logits/chosen": -2.3879854679107666, + "logits/rejected": -2.354921817779541, + "logps/chosen": -173.9727020263672, + "logps/rejected": -206.2282257080078, + "loss": 0.6065, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.1880525350570679, + "rewards/margins": 0.3559701442718506, + "rewards/rejected": -1.5440226793289185, + "step": 15150 + }, + { + "epoch": 2.6119917298414888, + "grad_norm": 43.05264663696289, + "learning_rate": 5.010125334302745e-09, + "logits/chosen": -2.2540364265441895, + "logits/rejected": -2.2353858947753906, + "logps/chosen": -171.33876037597656, + "logps/rejected": -208.04934692382812, + "loss": 0.5753, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.1556618213653564, + "rewards/margins": 0.3874899744987488, + "rewards/rejected": -1.54315185546875, + "step": 15160 + }, + { + "epoch": 2.6137146795313577, + "grad_norm": 32.51591110229492, + "learning_rate": 4.9664792996772285e-09, + "logits/chosen": -2.247185230255127, + "logits/rejected": -2.2258715629577637, + "logps/chosen": -167.01535034179688, + "logps/rejected": -204.393798828125, + "loss": 0.5853, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1358674764633179, + "rewards/margins": 0.37841638922691345, + "rewards/rejected": -1.5142838954925537, + "step": 15170 + }, + { + "epoch": 2.6154376292212267, + "grad_norm": 30.162477493286133, + "learning_rate": 4.923014272942688e-09, + "logits/chosen": -2.3051016330718994, + "logits/rejected": -2.293006420135498, + "logps/chosen": -183.97933959960938, + "logps/rejected": -219.2945556640625, + "loss": 0.6015, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.271348237991333, + "rewards/margins": 0.37271079421043396, + "rewards/rejected": -1.644059181213379, + "step": 15180 + }, + { + "epoch": 2.6171605789110957, + "grad_norm": 41.51902389526367, + "learning_rate": 4.87973042880262e-09, + "logits/chosen": -2.25746488571167, + "logits/rejected": -2.24119234085083, + "logps/chosen": -172.4810333251953, + "logps/rejected": -201.8636016845703, + "loss": 0.5945, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.187237024307251, + "rewards/margins": 0.3163180947303772, + "rewards/rejected": -1.5035550594329834, + "step": 15190 + }, + { + "epoch": 2.618883528600965, + "grad_norm": 26.529691696166992, + "learning_rate": 4.836627941232252e-09, + "logits/chosen": -2.3084750175476074, + "logits/rejected": -2.2735514640808105, + "logps/chosen": -176.02841186523438, + "logps/rejected": -210.1353759765625, + "loss": 0.5773, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1953575611114502, + "rewards/margins": 0.3805798590183258, + "rewards/rejected": -1.5759375095367432, + "step": 15200 + }, + { + "epoch": 2.618883528600965, + "eval_logits/chosen": -2.358809471130371, + "eval_logits/rejected": -2.346810817718506, + "eval_logps/chosen": -163.36053466796875, + "eval_logps/rejected": -184.21234130859375, + "eval_loss": 0.650810182094574, + "eval_rewards/accuracies": 0.6024628281593323, + "eval_rewards/chosen": -1.0434505939483643, + "eval_rewards/margins": 0.17117682099342346, + "eval_rewards/rejected": -1.2146275043487549, + "eval_runtime": 384.5077, + "eval_samples_per_second": 11.194, + "eval_steps_per_second": 1.399, + "step": 15200 + }, + { + "epoch": 2.620606478290834, + "grad_norm": 39.072540283203125, + "learning_rate": 4.793706983477869e-09, + "logits/chosen": -2.226433277130127, + "logits/rejected": -2.1847689151763916, + "logps/chosen": -183.5162353515625, + "logps/rejected": -212.4745330810547, + "loss": 0.6092, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.2674076557159424, + "rewards/margins": 0.343503475189209, + "rewards/rejected": -1.6109111309051514, + "step": 15210 + }, + { + "epoch": 2.622329427980703, + "grad_norm": 44.00055694580078, + "learning_rate": 4.750967728056127e-09, + "logits/chosen": -2.234224557876587, + "logits/rejected": -2.1999948024749756, + "logps/chosen": -167.09426879882812, + "logps/rejected": -203.2758331298828, + "loss": 0.5562, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.1392322778701782, + "rewards/margins": 0.40779203176498413, + "rewards/rejected": -1.5470244884490967, + "step": 15220 + }, + { + "epoch": 2.624052377670572, + "grad_norm": 47.231117248535156, + "learning_rate": 4.7084103467533384e-09, + "logits/chosen": -2.258369207382202, + "logits/rejected": -2.2343966960906982, + "logps/chosen": -180.4640655517578, + "logps/rejected": -211.6443634033203, + "loss": 0.6035, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.2449473142623901, + "rewards/margins": 0.3434852957725525, + "rewards/rejected": -1.5884325504302979, + "step": 15230 + }, + { + "epoch": 2.625775327360441, + "grad_norm": 45.69564437866211, + "learning_rate": 4.666035010624797e-09, + "logits/chosen": -2.2520008087158203, + "logits/rejected": -2.213196277618408, + "logps/chosen": -179.78683471679688, + "logps/rejected": -211.46395874023438, + "loss": 0.5779, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.1973354816436768, + "rewards/margins": 0.38800281286239624, + "rewards/rejected": -1.5853383541107178, + "step": 15240 + }, + { + "epoch": 2.62749827705031, + "grad_norm": 43.75592803955078, + "learning_rate": 4.623841889994057e-09, + "logits/chosen": -2.2950878143310547, + "logits/rejected": -2.2744078636169434, + "logps/chosen": -170.07020568847656, + "logps/rejected": -206.53176879882812, + "loss": 0.5915, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.1582139730453491, + "rewards/margins": 0.37098950147628784, + "rewards/rejected": -1.5292034149169922, + "step": 15250 + }, + { + "epoch": 2.6292212267401793, + "grad_norm": 33.61025619506836, + "learning_rate": 4.581831154452304e-09, + "logits/chosen": -2.254704475402832, + "logits/rejected": -2.232483386993408, + "logps/chosen": -179.52354431152344, + "logps/rejected": -203.71212768554688, + "loss": 0.6168, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2406651973724365, + "rewards/margins": 0.30228352546691895, + "rewards/rejected": -1.5429487228393555, + "step": 15260 + }, + { + "epoch": 2.6309441764300483, + "grad_norm": 39.85773468017578, + "learning_rate": 4.540002972857654e-09, + "logits/chosen": -2.2991280555725098, + "logits/rejected": -2.254185914993286, + "logps/chosen": -192.9390411376953, + "logps/rejected": -221.02963256835938, + "loss": 0.6173, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.371813416481018, + "rewards/margins": 0.348710298538208, + "rewards/rejected": -1.7205238342285156, + "step": 15270 + }, + { + "epoch": 2.6326671261199173, + "grad_norm": 41.84758377075195, + "learning_rate": 4.498357513334433e-09, + "logits/chosen": -2.3626158237457275, + "logits/rejected": -2.3408305644989014, + "logps/chosen": -176.02212524414062, + "logps/rejected": -209.27206420898438, + "loss": 0.5924, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.2225972414016724, + "rewards/margins": 0.33926254510879517, + "rewards/rejected": -1.5618598461151123, + "step": 15280 + }, + { + "epoch": 2.6343900758097862, + "grad_norm": 28.77754020690918, + "learning_rate": 4.456894943272532e-09, + "logits/chosen": -2.285330295562744, + "logits/rejected": -2.2468972206115723, + "logps/chosen": -179.88723754882812, + "logps/rejected": -220.12503051757812, + "loss": 0.5744, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.243018388748169, + "rewards/margins": 0.4545852243900299, + "rewards/rejected": -1.6976035833358765, + "step": 15290 + }, + { + "epoch": 2.6361130254996556, + "grad_norm": 44.45974349975586, + "learning_rate": 4.415615429326769e-09, + "logits/chosen": -2.2029223442077637, + "logits/rejected": -2.167741060256958, + "logps/chosen": -179.9371337890625, + "logps/rejected": -213.7763214111328, + "loss": 0.6049, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.27082359790802, + "rewards/margins": 0.3803272247314453, + "rewards/rejected": -1.6511509418487549, + "step": 15300 + }, + { + "epoch": 2.6378359751895246, + "grad_norm": 31.738445281982422, + "learning_rate": 4.374519137416172e-09, + "logits/chosen": -2.324070692062378, + "logits/rejected": -2.2945971488952637, + "logps/chosen": -177.03167724609375, + "logps/rejected": -209.97781372070312, + "loss": 0.5879, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.21383798122406, + "rewards/margins": 0.3603143095970154, + "rewards/rejected": -1.5741521120071411, + "step": 15310 + }, + { + "epoch": 2.6395589248793936, + "grad_norm": 45.1463508605957, + "learning_rate": 4.333606232723308e-09, + "logits/chosen": -2.2548727989196777, + "logits/rejected": -2.2461423873901367, + "logps/chosen": -176.7523193359375, + "logps/rejected": -208.3860321044922, + "loss": 0.6224, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2468162775039673, + "rewards/margins": 0.31269317865371704, + "rewards/rejected": -1.559509515762329, + "step": 15320 + }, + { + "epoch": 2.6412818745692626, + "grad_norm": 32.08575439453125, + "learning_rate": 4.292876879693646e-09, + "logits/chosen": -2.2762961387634277, + "logits/rejected": -2.247575283050537, + "logps/chosen": -179.0817413330078, + "logps/rejected": -213.4488067626953, + "loss": 0.5843, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.225376844406128, + "rewards/margins": 0.37693697214126587, + "rewards/rejected": -1.602313756942749, + "step": 15330 + }, + { + "epoch": 2.6430048242591315, + "grad_norm": 25.77557945251465, + "learning_rate": 4.252331242034912e-09, + "logits/chosen": -2.2798683643341064, + "logits/rejected": -2.2595818042755127, + "logps/chosen": -181.40280151367188, + "logps/rejected": -215.9775390625, + "loss": 0.5956, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.2631257772445679, + "rewards/margins": 0.35496434569358826, + "rewards/rejected": -1.618090271949768, + "step": 15340 + }, + { + "epoch": 2.6447277739490005, + "grad_norm": 33.43914031982422, + "learning_rate": 4.211969482716354e-09, + "logits/chosen": -2.2000720500946045, + "logits/rejected": -2.1807875633239746, + "logps/chosen": -178.42819213867188, + "logps/rejected": -214.453857421875, + "loss": 0.5883, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.2338414192199707, + "rewards/margins": 0.38494712114334106, + "rewards/rejected": -1.618788480758667, + "step": 15350 + }, + { + "epoch": 2.64645072363887, + "grad_norm": 37.96018600463867, + "learning_rate": 4.171791763968191e-09, + "logits/chosen": -2.2911009788513184, + "logits/rejected": -2.2742550373077393, + "logps/chosen": -176.56112670898438, + "logps/rejected": -210.6365966796875, + "loss": 0.6081, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.2332837581634521, + "rewards/margins": 0.3278941512107849, + "rewards/rejected": -1.5611779689788818, + "step": 15360 + }, + { + "epoch": 2.648173673328739, + "grad_norm": 31.777172088623047, + "learning_rate": 4.131798247280882e-09, + "logits/chosen": -2.303417921066284, + "logits/rejected": -2.2715156078338623, + "logps/chosen": -180.98085021972656, + "logps/rejected": -208.2545166015625, + "loss": 0.614, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.2401424646377563, + "rewards/margins": 0.3373335897922516, + "rewards/rejected": -1.577476143836975, + "step": 15370 + }, + { + "epoch": 2.649896623018608, + "grad_norm": 30.086198806762695, + "learning_rate": 4.091989093404513e-09, + "logits/chosen": -2.2850561141967773, + "logits/rejected": -2.261882781982422, + "logps/chosen": -177.33334350585938, + "logps/rejected": -215.7422637939453, + "loss": 0.5643, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.2238959074020386, + "rewards/margins": 0.42093032598495483, + "rewards/rejected": -1.6448261737823486, + "step": 15380 + }, + { + "epoch": 2.651619572708477, + "grad_norm": 27.287952423095703, + "learning_rate": 4.052364462348118e-09, + "logits/chosen": -2.3435025215148926, + "logits/rejected": -2.3281779289245605, + "logps/chosen": -178.9251708984375, + "logps/rejected": -215.21640014648438, + "loss": 0.5895, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.2218642234802246, + "rewards/margins": 0.38189610838890076, + "rewards/rejected": -1.6037603616714478, + "step": 15390 + }, + { + "epoch": 2.6533425223983462, + "grad_norm": 29.21660804748535, + "learning_rate": 4.01292451337909e-09, + "logits/chosen": -2.303785562515259, + "logits/rejected": -2.2735157012939453, + "logps/chosen": -189.47283935546875, + "logps/rejected": -205.9953155517578, + "loss": 0.6489, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.3067653179168701, + "rewards/margins": 0.2470887005329132, + "rewards/rejected": -1.5538541078567505, + "step": 15400 + }, + { + "epoch": 2.655065472088215, + "grad_norm": 40.113216400146484, + "learning_rate": 3.973669405022518e-09, + "logits/chosen": -2.2873952388763428, + "logits/rejected": -2.2475345134735107, + "logps/chosen": -190.634521484375, + "logps/rejected": -208.83047485351562, + "loss": 0.6316, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.2999731302261353, + "rewards/margins": 0.28097444772720337, + "rewards/rejected": -1.5809476375579834, + "step": 15410 + }, + { + "epoch": 2.656788421778084, + "grad_norm": 32.246517181396484, + "learning_rate": 3.934599295060481e-09, + "logits/chosen": -2.2759604454040527, + "logits/rejected": -2.251339912414551, + "logps/chosen": -172.6602783203125, + "logps/rejected": -220.6236114501953, + "loss": 0.5528, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.1907905340194702, + "rewards/margins": 0.49050837755203247, + "rewards/rejected": -1.6812989711761475, + "step": 15420 + }, + { + "epoch": 2.658511371467953, + "grad_norm": 28.570491790771484, + "learning_rate": 3.895714340531542e-09, + "logits/chosen": -2.351717710494995, + "logits/rejected": -2.320129871368408, + "logps/chosen": -183.42445373535156, + "logps/rejected": -215.3601531982422, + "loss": 0.5694, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.2372651100158691, + "rewards/margins": 0.38909897208213806, + "rewards/rejected": -1.6263641119003296, + "step": 15430 + }, + { + "epoch": 2.660234321157822, + "grad_norm": 33.97868728637695, + "learning_rate": 3.857014697730027e-09, + "logits/chosen": -2.372708559036255, + "logits/rejected": -2.3452935218811035, + "logps/chosen": -174.3719940185547, + "logps/rejected": -203.66500854492188, + "loss": 0.597, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.1882606744766235, + "rewards/margins": 0.3174138069152832, + "rewards/rejected": -1.5056743621826172, + "step": 15440 + }, + { + "epoch": 2.661957270847691, + "grad_norm": 33.95857620239258, + "learning_rate": 3.818500522205392e-09, + "logits/chosen": -2.1778063774108887, + "logits/rejected": -2.151001453399658, + "logps/chosen": -177.09414672851562, + "logps/rejected": -211.31924438476562, + "loss": 0.5925, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.2400333881378174, + "rewards/margins": 0.3476566672325134, + "rewards/rejected": -1.5876901149749756, + "step": 15450 + }, + { + "epoch": 2.66368022053756, + "grad_norm": 37.55846405029297, + "learning_rate": 3.7801719687616805e-09, + "logits/chosen": -2.3410890102386475, + "logits/rejected": -2.323452949523926, + "logps/chosen": -179.95013427734375, + "logps/rejected": -208.1508331298828, + "loss": 0.6136, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.1833778619766235, + "rewards/margins": 0.32879582047462463, + "rewards/rejected": -1.5121737718582153, + "step": 15460 + }, + { + "epoch": 2.6654031702274295, + "grad_norm": 47.06980895996094, + "learning_rate": 3.742029191456792e-09, + "logits/chosen": -2.3284058570861816, + "logits/rejected": -2.3038089275360107, + "logps/chosen": -194.41659545898438, + "logps/rejected": -222.1026611328125, + "loss": 0.6121, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.331230878829956, + "rewards/margins": 0.34454160928726196, + "rewards/rejected": -1.6757726669311523, + "step": 15470 + }, + { + "epoch": 2.6671261199172984, + "grad_norm": 31.516748428344727, + "learning_rate": 3.704072343601955e-09, + "logits/chosen": -2.3013763427734375, + "logits/rejected": -2.269360303878784, + "logps/chosen": -174.3995361328125, + "logps/rejected": -203.1074676513672, + "loss": 0.6133, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.1772360801696777, + "rewards/margins": 0.342081218957901, + "rewards/rejected": -1.519317388534546, + "step": 15480 + }, + { + "epoch": 2.6688490696071674, + "grad_norm": 48.16901397705078, + "learning_rate": 3.666301577761033e-09, + "logits/chosen": -2.2873353958129883, + "logits/rejected": -2.272728681564331, + "logps/chosen": -179.4340057373047, + "logps/rejected": -204.6569061279297, + "loss": 0.6174, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.2381466627120972, + "rewards/margins": 0.2938258647918701, + "rewards/rejected": -1.5319725275039673, + "step": 15490 + }, + { + "epoch": 2.670572019297037, + "grad_norm": 37.944313049316406, + "learning_rate": 3.628717045750007e-09, + "logits/chosen": -2.2588024139404297, + "logits/rejected": -2.2421507835388184, + "logps/chosen": -191.25660705566406, + "logps/rejected": -216.8999786376953, + "loss": 0.6351, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.3631250858306885, + "rewards/margins": 0.26649603247642517, + "rewards/rejected": -1.629621148109436, + "step": 15500 + }, + { + "epoch": 2.6722949689869058, + "grad_norm": 34.96455001831055, + "learning_rate": 3.591318898636253e-09, + "logits/chosen": -2.2264561653137207, + "logits/rejected": -2.196096897125244, + "logps/chosen": -182.52011108398438, + "logps/rejected": -215.2834930419922, + "loss": 0.5861, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.2337663173675537, + "rewards/margins": 0.38769131898880005, + "rewards/rejected": -1.6214576959609985, + "step": 15510 + }, + { + "epoch": 2.6740179186767747, + "grad_norm": 35.925941467285156, + "learning_rate": 3.5541072867380174e-09, + "logits/chosen": -2.216661214828491, + "logits/rejected": -2.1927154064178467, + "logps/chosen": -180.17608642578125, + "logps/rejected": -206.7606201171875, + "loss": 0.6073, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.2516624927520752, + "rewards/margins": 0.32696646451950073, + "rewards/rejected": -1.5786290168762207, + "step": 15520 + }, + { + "epoch": 2.6757408683666437, + "grad_norm": 30.07876968383789, + "learning_rate": 3.5170823596237852e-09, + "logits/chosen": -2.228799343109131, + "logits/rejected": -2.1991519927978516, + "logps/chosen": -167.81399536132812, + "logps/rejected": -206.71530151367188, + "loss": 0.563, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.1289836168289185, + "rewards/margins": 0.41038280725479126, + "rewards/rejected": -1.539366364479065, + "step": 15530 + }, + { + "epoch": 2.6774638180565127, + "grad_norm": 27.47809600830078, + "learning_rate": 3.480244266111687e-09, + "logits/chosen": -2.2589917182922363, + "logits/rejected": -2.2270827293395996, + "logps/chosen": -182.40170288085938, + "logps/rejected": -214.64804077148438, + "loss": 0.614, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.2905007600784302, + "rewards/margins": 0.3463669419288635, + "rewards/rejected": -1.6368677616119385, + "step": 15540 + }, + { + "epoch": 2.6791867677463816, + "grad_norm": 30.874013900756836, + "learning_rate": 3.4435931542688813e-09, + "logits/chosen": -2.342874050140381, + "logits/rejected": -2.313809633255005, + "logps/chosen": -183.68667602539062, + "logps/rejected": -216.19583129882812, + "loss": 0.5916, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.2788660526275635, + "rewards/margins": 0.3673045039176941, + "rewards/rejected": -1.6461708545684814, + "step": 15550 + }, + { + "epoch": 2.6809097174362506, + "grad_norm": 34.93791580200195, + "learning_rate": 3.407129171410966e-09, + "logits/chosen": -2.280003786087036, + "logits/rejected": -2.270197629928589, + "logps/chosen": -179.6429443359375, + "logps/rejected": -204.54757690429688, + "loss": 0.6461, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.2780966758728027, + "rewards/margins": 0.24179551005363464, + "rewards/rejected": -1.5198920965194702, + "step": 15560 + }, + { + "epoch": 2.68263266712612, + "grad_norm": 34.73818588256836, + "learning_rate": 3.3708524641014034e-09, + "logits/chosen": -2.321105718612671, + "logits/rejected": -2.294581890106201, + "logps/chosen": -188.01690673828125, + "logps/rejected": -217.1805877685547, + "loss": 0.6057, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.3426463603973389, + "rewards/margins": 0.33262020349502563, + "rewards/rejected": -1.6752665042877197, + "step": 15570 + }, + { + "epoch": 2.684355616815989, + "grad_norm": 37.94607162475586, + "learning_rate": 3.3347631781509344e-09, + "logits/chosen": -2.3370888233184814, + "logits/rejected": -2.317061185836792, + "logps/chosen": -182.791748046875, + "logps/rejected": -211.9905548095703, + "loss": 0.6156, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.2602826356887817, + "rewards/margins": 0.29799336194992065, + "rewards/rejected": -1.5582760572433472, + "step": 15580 + }, + { + "epoch": 2.686078566505858, + "grad_norm": 36.847930908203125, + "learning_rate": 3.298861458616947e-09, + "logits/chosen": -2.2724945545196533, + "logits/rejected": -2.2520852088928223, + "logps/chosen": -175.23680114746094, + "logps/rejected": -197.07168579101562, + "loss": 0.6402, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.1939842700958252, + "rewards/margins": 0.24006938934326172, + "rewards/rejected": -1.4340537786483765, + "step": 15590 + }, + { + "epoch": 2.687801516195727, + "grad_norm": 30.203340530395508, + "learning_rate": 3.263147449802939e-09, + "logits/chosen": -2.285566568374634, + "logits/rejected": -2.2599756717681885, + "logps/chosen": -181.5286865234375, + "logps/rejected": -216.1159210205078, + "loss": 0.5983, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.2878538370132446, + "rewards/margins": 0.3697057366371155, + "rewards/rejected": -1.6575596332550049, + "step": 15600 + }, + { + "epoch": 2.687801516195727, + "eval_logits/chosen": -2.354008913040161, + "eval_logits/rejected": -2.3419225215911865, + "eval_logps/chosen": -165.61573791503906, + "eval_logps/rejected": -186.71853637695312, + "eval_loss": 0.6505388021469116, + "eval_rewards/accuracies": 0.6017658114433289, + "eval_rewards/chosen": -1.0660027265548706, + "eval_rewards/margins": 0.17368650436401367, + "eval_rewards/rejected": -1.2396892309188843, + "eval_runtime": 384.7663, + "eval_samples_per_second": 11.186, + "eval_steps_per_second": 1.398, + "step": 15600 + }, + { + "epoch": 2.6895244658855963, + "grad_norm": 39.31675338745117, + "learning_rate": 3.227621295257921e-09, + "logits/chosen": -2.342576026916504, + "logits/rejected": -2.3186194896698, + "logps/chosen": -183.08203125, + "logps/rejected": -214.7406463623047, + "loss": 0.6001, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.2608639001846313, + "rewards/margins": 0.34865862131118774, + "rewards/rejected": -1.6095225811004639, + "step": 15610 + }, + { + "epoch": 2.6912474155754653, + "grad_norm": 54.18916702270508, + "learning_rate": 3.1922831377758586e-09, + "logits/chosen": -2.252718448638916, + "logits/rejected": -2.2375519275665283, + "logps/chosen": -167.72531127929688, + "logps/rejected": -209.5613555908203, + "loss": 0.5654, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.1279845237731934, + "rewards/margins": 0.41473323106765747, + "rewards/rejected": -1.542717695236206, + "step": 15620 + }, + { + "epoch": 2.6929703652653343, + "grad_norm": 29.02688217163086, + "learning_rate": 3.1571331193950444e-09, + "logits/chosen": -2.2631161212921143, + "logits/rejected": -2.2219128608703613, + "logps/chosen": -184.05958557128906, + "logps/rejected": -218.73251342773438, + "loss": 0.5838, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.2820950746536255, + "rewards/margins": 0.4124404788017273, + "rewards/rejected": -1.6945356130599976, + "step": 15630 + }, + { + "epoch": 2.6946933149552033, + "grad_norm": 27.868053436279297, + "learning_rate": 3.1221713813976037e-09, + "logits/chosen": -2.2706925868988037, + "logits/rejected": -2.242724895477295, + "logps/chosen": -172.6728057861328, + "logps/rejected": -218.58621215820312, + "loss": 0.5448, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.1886308193206787, + "rewards/margins": 0.4725852608680725, + "rewards/rejected": -1.661216139793396, + "step": 15640 + }, + { + "epoch": 2.6964162646450722, + "grad_norm": 42.40397262573242, + "learning_rate": 3.0873980643088603e-09, + "logits/chosen": -2.2558560371398926, + "logits/rejected": -2.2352757453918457, + "logps/chosen": -177.53944396972656, + "logps/rejected": -205.47412109375, + "loss": 0.6291, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.2249729633331299, + "rewards/margins": 0.29740267992019653, + "rewards/rejected": -1.5223757028579712, + "step": 15650 + }, + { + "epoch": 2.698139214334941, + "grad_norm": 30.136716842651367, + "learning_rate": 3.052813307896801e-09, + "logits/chosen": -2.328580617904663, + "logits/rejected": -2.3151848316192627, + "logps/chosen": -178.67636108398438, + "logps/rejected": -210.3419189453125, + "loss": 0.5929, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.247610330581665, + "rewards/margins": 0.3233638405799866, + "rewards/rejected": -1.570974349975586, + "step": 15660 + }, + { + "epoch": 2.6998621640248106, + "grad_norm": 29.27895164489746, + "learning_rate": 3.018417251171529e-09, + "logits/chosen": -2.2305965423583984, + "logits/rejected": -2.19374942779541, + "logps/chosen": -174.97256469726562, + "logps/rejected": -208.78018188476562, + "loss": 0.5698, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.1976064443588257, + "rewards/margins": 0.3816137909889221, + "rewards/rejected": -1.5792200565338135, + "step": 15670 + }, + { + "epoch": 2.7015851137146796, + "grad_norm": 27.068044662475586, + "learning_rate": 2.984210032384671e-09, + "logits/chosen": -2.246372699737549, + "logits/rejected": -2.223546028137207, + "logps/chosen": -184.91104125976562, + "logps/rejected": -221.4980926513672, + "loss": 0.5878, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.3069775104522705, + "rewards/margins": 0.3822581470012665, + "rewards/rejected": -1.6892354488372803, + "step": 15680 + }, + { + "epoch": 2.7033080634045485, + "grad_norm": 23.017866134643555, + "learning_rate": 2.9501917890288387e-09, + "logits/chosen": -2.278564929962158, + "logits/rejected": -2.2606887817382812, + "logps/chosen": -173.19595336914062, + "logps/rejected": -208.59121704101562, + "loss": 0.5943, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.1975430250167847, + "rewards/margins": 0.3586288392543793, + "rewards/rejected": -1.5561718940734863, + "step": 15690 + }, + { + "epoch": 2.7050310130944175, + "grad_norm": 44.127220153808594, + "learning_rate": 2.9163626578370736e-09, + "logits/chosen": -2.2899715900421143, + "logits/rejected": -2.264925241470337, + "logps/chosen": -179.9619140625, + "logps/rejected": -217.8975830078125, + "loss": 0.5913, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.2661584615707397, + "rewards/margins": 0.39361196756362915, + "rewards/rejected": -1.6597706079483032, + "step": 15700 + }, + { + "epoch": 2.706753962784287, + "grad_norm": 28.553180694580078, + "learning_rate": 2.882722774782315e-09, + "logits/chosen": -2.2921769618988037, + "logits/rejected": -2.2634499073028564, + "logps/chosen": -185.9178924560547, + "logps/rejected": -224.8857421875, + "loss": 0.584, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.2706458568572998, + "rewards/margins": 0.4241026043891907, + "rewards/rejected": -1.6947485208511353, + "step": 15710 + }, + { + "epoch": 2.708476912474156, + "grad_norm": 47.15506362915039, + "learning_rate": 2.8492722750768305e-09, + "logits/chosen": -2.2799127101898193, + "logits/rejected": -2.272749185562134, + "logps/chosen": -183.07467651367188, + "logps/rejected": -203.03172302246094, + "loss": 0.6529, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.2899291515350342, + "rewards/margins": 0.2201959192752838, + "rewards/rejected": -1.510124921798706, + "step": 15720 + }, + { + "epoch": 2.710199862164025, + "grad_norm": 32.763004302978516, + "learning_rate": 2.8160112931716663e-09, + "logits/chosen": -2.347216844558716, + "logits/rejected": -2.3261611461639404, + "logps/chosen": -172.67808532714844, + "logps/rejected": -205.3860626220703, + "loss": 0.595, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.1863274574279785, + "rewards/margins": 0.34425634145736694, + "rewards/rejected": -1.5305836200714111, + "step": 15730 + }, + { + "epoch": 2.711922811853894, + "grad_norm": 32.519248962402344, + "learning_rate": 2.782939962756126e-09, + "logits/chosen": -2.2975738048553467, + "logits/rejected": -2.2604293823242188, + "logps/chosen": -184.21490478515625, + "logps/rejected": -211.71826171875, + "loss": 0.6261, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.2909154891967773, + "rewards/margins": 0.33306872844696045, + "rewards/rejected": -1.6239840984344482, + "step": 15740 + }, + { + "epoch": 2.713645761543763, + "grad_norm": 49.53424072265625, + "learning_rate": 2.750058416757245e-09, + "logits/chosen": -2.2987732887268066, + "logits/rejected": -2.2767536640167236, + "logps/chosen": -185.88815307617188, + "logps/rejected": -221.35122680664062, + "loss": 0.5931, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.3290163278579712, + "rewards/margins": 0.3406520187854767, + "rewards/rejected": -1.669668197631836, + "step": 15750 + }, + { + "epoch": 2.7153687112336318, + "grad_norm": 36.956336975097656, + "learning_rate": 2.717366787339209e-09, + "logits/chosen": -2.2139859199523926, + "logits/rejected": -2.1922428607940674, + "logps/chosen": -177.16494750976562, + "logps/rejected": -205.7443389892578, + "loss": 0.6176, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.2540347576141357, + "rewards/margins": 0.30845338106155396, + "rewards/rejected": -1.562488317489624, + "step": 15760 + }, + { + "epoch": 2.717091660923501, + "grad_norm": 31.984783172607422, + "learning_rate": 2.684865205902881e-09, + "logits/chosen": -2.272738456726074, + "logits/rejected": -2.2462387084960938, + "logps/chosen": -166.43064880371094, + "logps/rejected": -212.916259765625, + "loss": 0.5243, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.1281863451004028, + "rewards/margins": 0.47644680738449097, + "rewards/rejected": -1.6046329736709595, + "step": 15770 + }, + { + "epoch": 2.71881461061337, + "grad_norm": 33.240516662597656, + "learning_rate": 2.6525538030852223e-09, + "logits/chosen": -2.37518048286438, + "logits/rejected": -2.356550931930542, + "logps/chosen": -184.48910522460938, + "logps/rejected": -201.74148559570312, + "loss": 0.6542, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.2998746633529663, + "rewards/margins": 0.1974526196718216, + "rewards/rejected": -1.497327208518982, + "step": 15780 + }, + { + "epoch": 2.720537560303239, + "grad_norm": 50.996089935302734, + "learning_rate": 2.620432708758802e-09, + "logits/chosen": -2.2115371227264404, + "logits/rejected": -2.197854995727539, + "logps/chosen": -183.6704864501953, + "logps/rejected": -209.2452392578125, + "loss": 0.6137, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2914540767669678, + "rewards/margins": 0.2838708460330963, + "rewards/rejected": -1.5753250122070312, + "step": 15790 + }, + { + "epoch": 2.722260509993108, + "grad_norm": 33.28852081298828, + "learning_rate": 2.5885020520312604e-09, + "logits/chosen": -2.345170497894287, + "logits/rejected": -2.3010005950927734, + "logps/chosen": -177.3496856689453, + "logps/rejected": -206.03549194335938, + "loss": 0.5765, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.17936110496521, + "rewards/margins": 0.3781944811344147, + "rewards/rejected": -1.5575557947158813, + "step": 15800 + }, + { + "epoch": 2.7239834596829775, + "grad_norm": 40.61261749267578, + "learning_rate": 2.5567619612447854e-09, + "logits/chosen": -2.3195712566375732, + "logits/rejected": -2.309903621673584, + "logps/chosen": -178.634033203125, + "logps/rejected": -210.8284912109375, + "loss": 0.6173, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.29694402217865, + "rewards/margins": 0.29339122772216797, + "rewards/rejected": -1.5903352499008179, + "step": 15810 + }, + { + "epoch": 2.7257064093728465, + "grad_norm": 34.352272033691406, + "learning_rate": 2.5252125639756207e-09, + "logits/chosen": -2.225785255432129, + "logits/rejected": -2.202213764190674, + "logps/chosen": -179.4593048095703, + "logps/rejected": -213.7439422607422, + "loss": 0.6017, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.2712770700454712, + "rewards/margins": 0.35494905710220337, + "rewards/rejected": -1.6262260675430298, + "step": 15820 + }, + { + "epoch": 2.7274293590627154, + "grad_norm": 45.09383773803711, + "learning_rate": 2.493853987033523e-09, + "logits/chosen": -2.314626455307007, + "logits/rejected": -2.3013596534729004, + "logps/chosen": -173.50193786621094, + "logps/rejected": -207.23251342773438, + "loss": 0.6048, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.2176988124847412, + "rewards/margins": 0.3500228226184845, + "rewards/rejected": -1.5677217245101929, + "step": 15830 + }, + { + "epoch": 2.7291523087525844, + "grad_norm": 32.357215881347656, + "learning_rate": 2.4626863564612467e-09, + "logits/chosen": -2.3258578777313232, + "logits/rejected": -2.3102779388427734, + "logps/chosen": -193.52291870117188, + "logps/rejected": -226.4375457763672, + "loss": 0.6193, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.3908226490020752, + "rewards/margins": 0.32600319385528564, + "rewards/rejected": -1.71682608127594, + "step": 15840 + }, + { + "epoch": 2.7308752584424534, + "grad_norm": 41.314552307128906, + "learning_rate": 2.4317097975340985e-09, + "logits/chosen": -2.301025390625, + "logits/rejected": -2.2786521911621094, + "logps/chosen": -179.1788330078125, + "logps/rejected": -206.5927276611328, + "loss": 0.6233, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.2470613718032837, + "rewards/margins": 0.2841149866580963, + "rewards/rejected": -1.5311763286590576, + "step": 15850 + }, + { + "epoch": 2.7325982081323223, + "grad_norm": 72.44612884521484, + "learning_rate": 2.4009244347593604e-09, + "logits/chosen": -2.2927424907684326, + "logits/rejected": -2.264702320098877, + "logps/chosen": -178.3214569091797, + "logps/rejected": -201.4319305419922, + "loss": 0.6281, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.240483283996582, + "rewards/margins": 0.2728969156742096, + "rewards/rejected": -1.5133801698684692, + "step": 15860 + }, + { + "epoch": 2.7343211578221913, + "grad_norm": 33.07346725463867, + "learning_rate": 2.370330391875819e-09, + "logits/chosen": -2.2673001289367676, + "logits/rejected": -2.2333619594573975, + "logps/chosen": -184.83200073242188, + "logps/rejected": -225.6978759765625, + "loss": 0.566, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.290633201599121, + "rewards/margins": 0.44913339614868164, + "rewards/rejected": -1.7397664785385132, + "step": 15870 + }, + { + "epoch": 2.7360441075120607, + "grad_norm": 33.767364501953125, + "learning_rate": 2.3399277918532854e-09, + "logits/chosen": -2.2728636264801025, + "logits/rejected": -2.257136821746826, + "logps/chosen": -179.16775512695312, + "logps/rejected": -213.8397979736328, + "loss": 0.5858, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.2510485649108887, + "rewards/margins": 0.3816913962364197, + "rewards/rejected": -1.6327400207519531, + "step": 15880 + }, + { + "epoch": 2.7377670572019297, + "grad_norm": 47.875667572021484, + "learning_rate": 2.309716756892083e-09, + "logits/chosen": -2.3037915229797363, + "logits/rejected": -2.2675938606262207, + "logps/chosen": -176.85299682617188, + "logps/rejected": -209.14602661132812, + "loss": 0.5899, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.2395904064178467, + "rewards/margins": 0.3721577823162079, + "rewards/rejected": -1.6117480993270874, + "step": 15890 + }, + { + "epoch": 2.7394900068917987, + "grad_norm": 49.56647872924805, + "learning_rate": 2.2796974084225373e-09, + "logits/chosen": -2.299668550491333, + "logits/rejected": -2.2523045539855957, + "logps/chosen": -190.56446838378906, + "logps/rejected": -217.8603973388672, + "loss": 0.5895, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.2909893989562988, + "rewards/margins": 0.37662652134895325, + "rewards/rejected": -1.6676161289215088, + "step": 15900 + }, + { + "epoch": 2.741212956581668, + "grad_norm": 38.5182991027832, + "learning_rate": 2.249869867104537e-09, + "logits/chosen": -2.2339723110198975, + "logits/rejected": -2.208400249481201, + "logps/chosen": -172.8338623046875, + "logps/rejected": -198.0580291748047, + "loss": 0.6159, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.183179497718811, + "rewards/margins": 0.30717626214027405, + "rewards/rejected": -1.4903557300567627, + "step": 15910 + }, + { + "epoch": 2.742935906271537, + "grad_norm": 35.76197814941406, + "learning_rate": 2.220234252826991e-09, + "logits/chosen": -2.2531018257141113, + "logits/rejected": -2.2350258827209473, + "logps/chosen": -184.27220153808594, + "logps/rejected": -216.9780731201172, + "loss": 0.6113, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.3089081048965454, + "rewards/margins": 0.32555288076400757, + "rewards/rejected": -1.6344608068466187, + "step": 15920 + }, + { + "epoch": 2.744658855961406, + "grad_norm": 37.21056365966797, + "learning_rate": 2.190790684707411e-09, + "logits/chosen": -2.211575984954834, + "logits/rejected": -2.1774814128875732, + "logps/chosen": -173.06137084960938, + "logps/rejected": -198.30435180664062, + "loss": 0.5968, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.1646642684936523, + "rewards/margins": 0.32496175169944763, + "rewards/rejected": -1.4896259307861328, + "step": 15930 + }, + { + "epoch": 2.746381805651275, + "grad_norm": 32.13726806640625, + "learning_rate": 2.161539281091351e-09, + "logits/chosen": -2.264610528945923, + "logits/rejected": -2.2332422733306885, + "logps/chosen": -180.72996520996094, + "logps/rejected": -229.2228546142578, + "loss": 0.5391, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.202547311782837, + "rewards/margins": 0.5045925378799438, + "rewards/rejected": -1.7071399688720703, + "step": 15940 + }, + { + "epoch": 2.748104755341144, + "grad_norm": 48.947052001953125, + "learning_rate": 2.1324801595520357e-09, + "logits/chosen": -2.3333230018615723, + "logits/rejected": -2.304076671600342, + "logps/chosen": -178.30613708496094, + "logps/rejected": -203.87501525878906, + "loss": 0.6035, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.189021348953247, + "rewards/margins": 0.333077996969223, + "rewards/rejected": -1.522099256515503, + "step": 15950 + }, + { + "epoch": 2.749827705031013, + "grad_norm": 33.582252502441406, + "learning_rate": 2.1036134368897785e-09, + "logits/chosen": -2.309985876083374, + "logits/rejected": -2.283900260925293, + "logps/chosen": -183.0913543701172, + "logps/rejected": -211.28482055664062, + "loss": 0.6135, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.2604248523712158, + "rewards/margins": 0.33174318075180054, + "rewards/rejected": -1.592167854309082, + "step": 15960 + }, + { + "epoch": 2.751550654720882, + "grad_norm": 33.72100830078125, + "learning_rate": 2.0749392291315894e-09, + "logits/chosen": -2.291177988052368, + "logits/rejected": -2.267993211746216, + "logps/chosen": -182.629638671875, + "logps/rejected": -216.76400756835938, + "loss": 0.5853, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.2580294609069824, + "rewards/margins": 0.3601645827293396, + "rewards/rejected": -1.6181939840316772, + "step": 15970 + }, + { + "epoch": 2.7532736044107513, + "grad_norm": 38.948707580566406, + "learning_rate": 2.046457651530686e-09, + "logits/chosen": -2.2676291465759277, + "logits/rejected": -2.246793270111084, + "logps/chosen": -181.0098114013672, + "logps/rejected": -211.0522003173828, + "loss": 0.6109, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.2406721115112305, + "rewards/margins": 0.3142922520637512, + "rewards/rejected": -1.554964303970337, + "step": 15980 + }, + { + "epoch": 2.7549965541006203, + "grad_norm": 41.575164794921875, + "learning_rate": 2.0181688185660183e-09, + "logits/chosen": -2.370026111602783, + "logits/rejected": -2.3678243160247803, + "logps/chosen": -175.36595153808594, + "logps/rejected": -205.42568969726562, + "loss": 0.6076, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.1957898139953613, + "rewards/margins": 0.3089122772216797, + "rewards/rejected": -1.5047019720077515, + "step": 15990 + }, + { + "epoch": 2.7567195037904892, + "grad_norm": 32.23805236816406, + "learning_rate": 1.99007284394182e-09, + "logits/chosen": -2.247300624847412, + "logits/rejected": -2.2156033515930176, + "logps/chosen": -181.68435668945312, + "logps/rejected": -212.2035369873047, + "loss": 0.5983, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.27165949344635, + "rewards/margins": 0.34139880537986755, + "rewards/rejected": -1.61305832862854, + "step": 16000 + }, + { + "epoch": 2.7567195037904892, + "eval_logits/chosen": -2.3530187606811523, + "eval_logits/rejected": -2.34079647064209, + "eval_logps/chosen": -166.0839080810547, + "eval_logps/rejected": -187.39894104003906, + "eval_loss": 0.6500682234764099, + "eval_rewards/accuracies": 0.6029275059700012, + "eval_rewards/chosen": -1.0706843137741089, + "eval_rewards/margins": 0.1758090853691101, + "eval_rewards/rejected": -1.2464934587478638, + "eval_runtime": 384.7363, + "eval_samples_per_second": 11.187, + "eval_steps_per_second": 1.398, + "step": 16000 + }, + { + "epoch": 2.758442453480358, + "grad_norm": 39.961814880371094, + "learning_rate": 1.9621698405871466e-09, + "logits/chosen": -2.3224873542785645, + "logits/rejected": -2.309032440185547, + "logps/chosen": -184.5162811279297, + "logps/rejected": -210.5989990234375, + "loss": 0.6341, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.2852531671524048, + "rewards/margins": 0.2957290709018707, + "rewards/rejected": -1.580981969833374, + "step": 16010 + }, + { + "epoch": 2.7601654031702276, + "grad_norm": 33.21334457397461, + "learning_rate": 1.934459920655429e-09, + "logits/chosen": -2.356708288192749, + "logits/rejected": -2.330230236053467, + "logps/chosen": -181.0506591796875, + "logps/rejected": -212.9567413330078, + "loss": 0.6103, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.2669428586959839, + "rewards/margins": 0.3563677966594696, + "rewards/rejected": -1.6233106851577759, + "step": 16020 + }, + { + "epoch": 2.7618883528600966, + "grad_norm": 38.95966339111328, + "learning_rate": 1.90694319552403e-09, + "logits/chosen": -2.3382115364074707, + "logits/rejected": -2.3172762393951416, + "logps/chosen": -179.08792114257812, + "logps/rejected": -211.1810302734375, + "loss": 0.6094, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.2544938325881958, + "rewards/margins": 0.3421923518180847, + "rewards/rejected": -1.5966860055923462, + "step": 16030 + }, + { + "epoch": 2.7636113025499656, + "grad_norm": 47.20829391479492, + "learning_rate": 1.879619775793756e-09, + "logits/chosen": -2.3078360557556152, + "logits/rejected": -2.286541223526001, + "logps/chosen": -181.668212890625, + "logps/rejected": -212.9115753173828, + "loss": 0.6164, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.2616897821426392, + "rewards/margins": 0.35222771763801575, + "rewards/rejected": -1.613917589187622, + "step": 16040 + }, + { + "epoch": 2.7653342522398345, + "grad_norm": 31.04566764831543, + "learning_rate": 1.8524897712884514e-09, + "logits/chosen": -2.279730796813965, + "logits/rejected": -2.2557711601257324, + "logps/chosen": -180.85476684570312, + "logps/rejected": -219.46853637695312, + "loss": 0.5784, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2566825151443481, + "rewards/margins": 0.4106810986995697, + "rewards/rejected": -1.6673635244369507, + "step": 16050 + }, + { + "epoch": 2.7670572019297035, + "grad_norm": 34.17963790893555, + "learning_rate": 1.8255532910545657e-09, + "logits/chosen": -2.295279026031494, + "logits/rejected": -2.276061773300171, + "logps/chosen": -175.43310546875, + "logps/rejected": -205.7451629638672, + "loss": 0.5895, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1868820190429688, + "rewards/margins": 0.3304034173488617, + "rewards/rejected": -1.5172855854034424, + "step": 16060 + }, + { + "epoch": 2.7687801516195725, + "grad_norm": 38.8708610534668, + "learning_rate": 1.798810443360671e-09, + "logits/chosen": -2.2456247806549072, + "logits/rejected": -2.221320390701294, + "logps/chosen": -184.22183227539062, + "logps/rejected": -217.4356231689453, + "loss": 0.575, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.2606128454208374, + "rewards/margins": 0.3818413019180298, + "rewards/rejected": -1.6424541473388672, + "step": 16070 + }, + { + "epoch": 2.770503101309442, + "grad_norm": 32.86567306518555, + "learning_rate": 1.7722613356970728e-09, + "logits/chosen": -2.2950329780578613, + "logits/rejected": -2.2494683265686035, + "logps/chosen": -183.48300170898438, + "logps/rejected": -220.6748504638672, + "loss": 0.5709, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.2529817819595337, + "rewards/margins": 0.4442780613899231, + "rewards/rejected": -1.6972599029541016, + "step": 16080 + }, + { + "epoch": 2.772226050999311, + "grad_norm": 30.092023849487305, + "learning_rate": 1.745906074775344e-09, + "logits/chosen": -2.2925052642822266, + "logits/rejected": -2.263424873352051, + "logps/chosen": -170.57962036132812, + "logps/rejected": -206.346435546875, + "loss": 0.5744, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.1700583696365356, + "rewards/margins": 0.38973119854927063, + "rewards/rejected": -1.5597896575927734, + "step": 16090 + }, + { + "epoch": 2.77394900068918, + "grad_norm": 34.294044494628906, + "learning_rate": 1.7197447665279142e-09, + "logits/chosen": -2.3059916496276855, + "logits/rejected": -2.288022518157959, + "logps/chosen": -178.9658203125, + "logps/rejected": -226.75601196289062, + "loss": 0.5548, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.2523266077041626, + "rewards/margins": 0.47826337814331055, + "rewards/rejected": -1.7305901050567627, + "step": 16100 + }, + { + "epoch": 2.775671950379049, + "grad_norm": 46.52742385864258, + "learning_rate": 1.6937775161076251e-09, + "logits/chosen": -2.220905303955078, + "logits/rejected": -2.183434009552002, + "logps/chosen": -176.07647705078125, + "logps/rejected": -209.58218383789062, + "loss": 0.5872, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1807199716567993, + "rewards/margins": 0.3810442090034485, + "rewards/rejected": -1.561764121055603, + "step": 16110 + }, + { + "epoch": 2.777394900068918, + "grad_norm": 44.02835464477539, + "learning_rate": 1.6680044278873428e-09, + "logits/chosen": -2.2952685356140137, + "logits/rejected": -2.279714345932007, + "logps/chosen": -170.16844177246094, + "logps/rejected": -202.47323608398438, + "loss": 0.6085, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.1918901205062866, + "rewards/margins": 0.31227028369903564, + "rewards/rejected": -1.5041604042053223, + "step": 16120 + }, + { + "epoch": 2.779117849758787, + "grad_norm": 48.453269958496094, + "learning_rate": 1.6424256054595187e-09, + "logits/chosen": -2.2706055641174316, + "logits/rejected": -2.2444052696228027, + "logps/chosen": -183.84979248046875, + "logps/rejected": -212.5630340576172, + "loss": 0.6162, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.3051106929779053, + "rewards/margins": 0.3154757618904114, + "rewards/rejected": -1.6205863952636719, + "step": 16130 + }, + { + "epoch": 2.780840799448656, + "grad_norm": 40.92231750488281, + "learning_rate": 1.6170411516357563e-09, + "logits/chosen": -2.348477363586426, + "logits/rejected": -2.3189752101898193, + "logps/chosen": -180.53762817382812, + "logps/rejected": -211.91152954101562, + "loss": 0.5974, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.2678172588348389, + "rewards/margins": 0.3369593024253845, + "rewards/rejected": -1.604776382446289, + "step": 16140 + }, + { + "epoch": 2.782563749138525, + "grad_norm": 43.04970932006836, + "learning_rate": 1.5918511684464008e-09, + "logits/chosen": -2.323899030685425, + "logits/rejected": -2.3004534244537354, + "logps/chosen": -183.69158935546875, + "logps/rejected": -213.60726928710938, + "loss": 0.6019, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.2804274559020996, + "rewards/margins": 0.34259334206581116, + "rewards/rejected": -1.6230207681655884, + "step": 16150 + }, + { + "epoch": 2.784286698828394, + "grad_norm": 34.49961471557617, + "learning_rate": 1.5668557571401786e-09, + "logits/chosen": -2.3167967796325684, + "logits/rejected": -2.284923791885376, + "logps/chosen": -173.17575073242188, + "logps/rejected": -217.793701171875, + "loss": 0.5502, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.1839545965194702, + "rewards/margins": 0.46423736214637756, + "rewards/rejected": -1.648192048072815, + "step": 16160 + }, + { + "epoch": 2.786009648518263, + "grad_norm": 25.138389587402344, + "learning_rate": 1.5420550181837245e-09, + "logits/chosen": -2.2304892539978027, + "logits/rejected": -2.2073214054107666, + "logps/chosen": -185.9856414794922, + "logps/rejected": -214.74386596679688, + "loss": 0.6033, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.314554214477539, + "rewards/margins": 0.3014872372150421, + "rewards/rejected": -1.6160414218902588, + "step": 16170 + }, + { + "epoch": 2.7877325982081325, + "grad_norm": 38.96569061279297, + "learning_rate": 1.517449051261227e-09, + "logits/chosen": -2.2721707820892334, + "logits/rejected": -2.2256643772125244, + "logps/chosen": -189.69200134277344, + "logps/rejected": -223.68936157226562, + "loss": 0.5777, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.297239899635315, + "rewards/margins": 0.4274531304836273, + "rewards/rejected": -1.7246930599212646, + "step": 16180 + }, + { + "epoch": 2.7894555478980014, + "grad_norm": 50.26418685913086, + "learning_rate": 1.4930379552739791e-09, + "logits/chosen": -2.2861552238464355, + "logits/rejected": -2.2630105018615723, + "logps/chosen": -181.56906127929688, + "logps/rejected": -213.3578643798828, + "loss": 0.6124, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.2681522369384766, + "rewards/margins": 0.3266271948814392, + "rewards/rejected": -1.594779372215271, + "step": 16190 + }, + { + "epoch": 2.7911784975878704, + "grad_norm": 39.742286682128906, + "learning_rate": 1.4688218283400334e-09, + "logits/chosen": -2.2232251167297363, + "logits/rejected": -2.1820454597473145, + "logps/chosen": -180.40234375, + "logps/rejected": -209.013671875, + "loss": 0.6131, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.2425382137298584, + "rewards/margins": 0.3541840612888336, + "rewards/rejected": -1.5967223644256592, + "step": 16200 + }, + { + "epoch": 2.7929014472777394, + "grad_norm": 33.191959381103516, + "learning_rate": 1.4448007677937746e-09, + "logits/chosen": -2.1963438987731934, + "logits/rejected": -2.1801505088806152, + "logps/chosen": -180.97879028320312, + "logps/rejected": -206.3859405517578, + "loss": 0.6406, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.2748857736587524, + "rewards/margins": 0.2578073740005493, + "rewards/rejected": -1.5326931476593018, + "step": 16210 + }, + { + "epoch": 2.794624396967609, + "grad_norm": 40.69626235961914, + "learning_rate": 1.420974870185543e-09, + "logits/chosen": -2.2687435150146484, + "logits/rejected": -2.2401299476623535, + "logps/chosen": -174.5592041015625, + "logps/rejected": -214.38064575195312, + "loss": 0.5715, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.1669092178344727, + "rewards/margins": 0.43398723006248474, + "rewards/rejected": -1.6008965969085693, + "step": 16220 + }, + { + "epoch": 2.7963473466574778, + "grad_norm": 32.2374267578125, + "learning_rate": 1.3973442312812278e-09, + "logits/chosen": -2.289696216583252, + "logits/rejected": -2.2658562660217285, + "logps/chosen": -178.87796020507812, + "logps/rejected": -209.24978637695312, + "loss": 0.6037, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.2666771411895752, + "rewards/margins": 0.32003122568130493, + "rewards/rejected": -1.5867083072662354, + "step": 16230 + }, + { + "epoch": 2.7980702963473467, + "grad_norm": 37.53076171875, + "learning_rate": 1.373908946061908e-09, + "logits/chosen": -2.2701830863952637, + "logits/rejected": -2.248612880706787, + "logps/chosen": -183.1614227294922, + "logps/rejected": -212.554443359375, + "loss": 0.6157, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.298271656036377, + "rewards/margins": 0.3004020154476166, + "rewards/rejected": -1.598673701286316, + "step": 16240 + }, + { + "epoch": 2.7997932460372157, + "grad_norm": 36.98796463012695, + "learning_rate": 1.3506691087234457e-09, + "logits/chosen": -2.2960731983184814, + "logits/rejected": -2.2753283977508545, + "logps/chosen": -180.3482208251953, + "logps/rejected": -206.80184936523438, + "loss": 0.6087, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.2228682041168213, + "rewards/margins": 0.305896133184433, + "rewards/rejected": -1.528764009475708, + "step": 16250 + }, + { + "epoch": 2.8015161957270847, + "grad_norm": 47.55896759033203, + "learning_rate": 1.3276248126761259e-09, + "logits/chosen": -2.297947883605957, + "logits/rejected": -2.271911144256592, + "logps/chosen": -182.63546752929688, + "logps/rejected": -222.87747192382812, + "loss": 0.606, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.2915923595428467, + "rewards/margins": 0.38956117630004883, + "rewards/rejected": -1.6811535358428955, + "step": 16260 + }, + { + "epoch": 2.8032391454169536, + "grad_norm": 49.676631927490234, + "learning_rate": 1.304776150544279e-09, + "logits/chosen": -2.3071579933166504, + "logits/rejected": -2.2782936096191406, + "logps/chosen": -178.2207794189453, + "logps/rejected": -212.48013305664062, + "loss": 0.5938, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.22140371799469, + "rewards/margins": 0.3800586760044098, + "rewards/rejected": -1.6014623641967773, + "step": 16270 + }, + { + "epoch": 2.804962095106823, + "grad_norm": 34.81820297241211, + "learning_rate": 1.2821232141658866e-09, + "logits/chosen": -2.35308837890625, + "logits/rejected": -2.3323071002960205, + "logps/chosen": -188.4277801513672, + "logps/rejected": -214.10342407226562, + "loss": 0.612, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.313417673110962, + "rewards/margins": 0.30872249603271484, + "rewards/rejected": -1.6221401691436768, + "step": 16280 + }, + { + "epoch": 2.806685044796692, + "grad_norm": 32.014060974121094, + "learning_rate": 1.2596660945922433e-09, + "logits/chosen": -2.307892084121704, + "logits/rejected": -2.2928123474121094, + "logps/chosen": -175.9683074951172, + "logps/rejected": -210.37765502929688, + "loss": 0.6093, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.2256677150726318, + "rewards/margins": 0.3405986428260803, + "rewards/rejected": -1.5662662982940674, + "step": 16290 + }, + { + "epoch": 2.808407994486561, + "grad_norm": 35.85638427734375, + "learning_rate": 1.2374048820875893e-09, + "logits/chosen": -2.3248770236968994, + "logits/rejected": -2.2926807403564453, + "logps/chosen": -183.6764678955078, + "logps/rejected": -212.8525390625, + "loss": 0.6077, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.287197232246399, + "rewards/margins": 0.33820948004722595, + "rewards/rejected": -1.6254066228866577, + "step": 16300 + }, + { + "epoch": 2.81013094417643, + "grad_norm": 28.929685592651367, + "learning_rate": 1.2153396661287007e-09, + "logits/chosen": -2.282907485961914, + "logits/rejected": -2.2625086307525635, + "logps/chosen": -181.51144409179688, + "logps/rejected": -209.85305786132812, + "loss": 0.6353, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.277237892150879, + "rewards/margins": 0.27741461992263794, + "rewards/rejected": -1.5546525716781616, + "step": 16310 + }, + { + "epoch": 2.8118538938662994, + "grad_norm": 38.36284255981445, + "learning_rate": 1.1934705354045894e-09, + "logits/chosen": -2.292874813079834, + "logits/rejected": -2.273878574371338, + "logps/chosen": -185.80093383789062, + "logps/rejected": -218.79623413085938, + "loss": 0.6061, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.2972584962844849, + "rewards/margins": 0.3475348949432373, + "rewards/rejected": -1.6447932720184326, + "step": 16320 + }, + { + "epoch": 2.8135768435561683, + "grad_norm": 28.26596450805664, + "learning_rate": 1.1717975778161193e-09, + "logits/chosen": -2.3105838298797607, + "logits/rejected": -2.2864773273468018, + "logps/chosen": -176.97756958007812, + "logps/rejected": -203.34059143066406, + "loss": 0.6141, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.1774392127990723, + "rewards/margins": 0.32404693961143494, + "rewards/rejected": -1.5014861822128296, + "step": 16330 + }, + { + "epoch": 2.8152997932460373, + "grad_norm": 29.667314529418945, + "learning_rate": 1.1503208804756526e-09, + "logits/chosen": -2.237299919128418, + "logits/rejected": -2.2079734802246094, + "logps/chosen": -175.31661987304688, + "logps/rejected": -215.7568817138672, + "loss": 0.5493, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.1911835670471191, + "rewards/margins": 0.452349990606308, + "rewards/rejected": -1.643533706665039, + "step": 16340 + }, + { + "epoch": 2.8170227429359063, + "grad_norm": 33.90262222290039, + "learning_rate": 1.1290405297066984e-09, + "logits/chosen": -2.371725559234619, + "logits/rejected": -2.335756778717041, + "logps/chosen": -177.57327270507812, + "logps/rejected": -211.3468017578125, + "loss": 0.5726, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.2037913799285889, + "rewards/margins": 0.3730456829071045, + "rewards/rejected": -1.5768373012542725, + "step": 16350 + }, + { + "epoch": 2.8187456926257752, + "grad_norm": 41.114654541015625, + "learning_rate": 1.1079566110435812e-09, + "logits/chosen": -2.2963128089904785, + "logits/rejected": -2.2677197456359863, + "logps/chosen": -175.20028686523438, + "logps/rejected": -209.4230194091797, + "loss": 0.5819, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.2134548425674438, + "rewards/margins": 0.3761741518974304, + "rewards/rejected": -1.589629054069519, + "step": 16360 + }, + { + "epoch": 2.820468642315644, + "grad_norm": 38.21614074707031, + "learning_rate": 1.0870692092310674e-09, + "logits/chosen": -2.2704720497131348, + "logits/rejected": -2.2500154972076416, + "logps/chosen": -181.661865234375, + "logps/rejected": -203.2222900390625, + "loss": 0.6249, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.2675329446792603, + "rewards/margins": 0.27036967873573303, + "rewards/rejected": -1.537902593612671, + "step": 16370 + }, + { + "epoch": 2.822191592005513, + "grad_norm": 34.16209411621094, + "learning_rate": 1.0663784082240556e-09, + "logits/chosen": -2.3108277320861816, + "logits/rejected": -2.2971932888031006, + "logps/chosen": -180.47698974609375, + "logps/rejected": -210.71932983398438, + "loss": 0.6062, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.2505805492401123, + "rewards/margins": 0.31283727288246155, + "rewards/rejected": -1.5634177923202515, + "step": 16380 + }, + { + "epoch": 2.8239145416953826, + "grad_norm": 33.69387435913086, + "learning_rate": 1.0458842911872213e-09, + "logits/chosen": -2.2244887351989746, + "logits/rejected": -2.2039520740509033, + "logps/chosen": -172.71177673339844, + "logps/rejected": -205.65087890625, + "loss": 0.607, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.1868815422058105, + "rewards/margins": 0.3532125651836395, + "rewards/rejected": -1.5400941371917725, + "step": 16390 + }, + { + "epoch": 2.8256374913852516, + "grad_norm": 38.37247848510742, + "learning_rate": 1.0255869404947049e-09, + "logits/chosen": -2.233137845993042, + "logits/rejected": -2.213040828704834, + "logps/chosen": -178.09384155273438, + "logps/rejected": -209.20669555664062, + "loss": 0.5956, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.2155065536499023, + "rewards/margins": 0.33726614713668823, + "rewards/rejected": -1.5527725219726562, + "step": 16400 + }, + { + "epoch": 2.8256374913852516, + "eval_logits/chosen": -2.3550028800964355, + "eval_logits/rejected": -2.34291672706604, + "eval_logps/chosen": -164.95201110839844, + "eval_logps/rejected": -186.080322265625, + "eval_loss": 0.650005042552948, + "eval_rewards/accuracies": 0.600836455821991, + "eval_rewards/chosen": -1.0593652725219727, + "eval_rewards/margins": 0.17394186556339264, + "eval_rewards/rejected": -1.2333072423934937, + "eval_runtime": 385.0643, + "eval_samples_per_second": 11.177, + "eval_steps_per_second": 1.397, + "step": 16400 + }, + { + "epoch": 2.8273604410751205, + "grad_norm": 32.81662368774414, + "learning_rate": 1.0054864377297357e-09, + "logits/chosen": -2.291510820388794, + "logits/rejected": -2.250988245010376, + "logps/chosen": -182.9836883544922, + "logps/rejected": -213.5102081298828, + "loss": 0.5815, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.2685143947601318, + "rewards/margins": 0.3915003836154938, + "rewards/rejected": -1.6600148677825928, + "step": 16410 + }, + { + "epoch": 2.82908339076499, + "grad_norm": 47.90050506591797, + "learning_rate": 9.855828636843422e-10, + "logits/chosen": -2.2053303718566895, + "logits/rejected": -2.1868436336517334, + "logps/chosen": -179.76307678222656, + "logps/rejected": -211.9607391357422, + "loss": 0.6076, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.2450826168060303, + "rewards/margins": 0.3121925890445709, + "rewards/rejected": -1.5572751760482788, + "step": 16420 + }, + { + "epoch": 2.830806340454859, + "grad_norm": 41.66067886352539, + "learning_rate": 9.65876298359025e-10, + "logits/chosen": -2.2113218307495117, + "logits/rejected": -2.198329448699951, + "logps/chosen": -182.64976501464844, + "logps/rejected": -212.186767578125, + "loss": 0.6203, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.2813270092010498, + "rewards/margins": 0.3096092939376831, + "rewards/rejected": -1.5909364223480225, + "step": 16430 + }, + { + "epoch": 2.832529290144728, + "grad_norm": 33.65230941772461, + "learning_rate": 9.463668209624298e-10, + "logits/chosen": -2.2552592754364014, + "logits/rejected": -2.2291042804718018, + "logps/chosen": -178.43710327148438, + "logps/rejected": -214.56546020507812, + "loss": 0.5943, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.2642335891723633, + "rewards/margins": 0.3713500201702118, + "rewards/rejected": -1.6355836391448975, + "step": 16440 + }, + { + "epoch": 2.834252239834597, + "grad_norm": 39.56161880493164, + "learning_rate": 9.270545099110072e-10, + "logits/chosen": -2.338459014892578, + "logits/rejected": -2.306795597076416, + "logps/chosen": -178.9355926513672, + "logps/rejected": -217.2798614501953, + "loss": 0.5548, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.2057013511657715, + "rewards/margins": 0.4274584650993347, + "rewards/rejected": -1.633159875869751, + "step": 16450 + }, + { + "epoch": 2.835975189524466, + "grad_norm": 41.27180099487305, + "learning_rate": 9.079394428287312e-10, + "logits/chosen": -2.1934924125671387, + "logits/rejected": -2.1793243885040283, + "logps/chosen": -167.72926330566406, + "logps/rejected": -211.9979248046875, + "loss": 0.568, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1811243295669556, + "rewards/margins": 0.4424045979976654, + "rewards/rejected": -1.6235288381576538, + "step": 16460 + }, + { + "epoch": 2.837698139214335, + "grad_norm": 39.97527313232422, + "learning_rate": 8.890216965467656e-10, + "logits/chosen": -2.308414936065674, + "logits/rejected": -2.282804489135742, + "logps/chosen": -175.1788330078125, + "logps/rejected": -217.9228057861328, + "loss": 0.5758, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2298270463943481, + "rewards/margins": 0.43262115120887756, + "rewards/rejected": -1.6624482870101929, + "step": 16470 + }, + { + "epoch": 2.8394210889042037, + "grad_norm": 47.5313720703125, + "learning_rate": 8.70301347103175e-10, + "logits/chosen": -2.2051024436950684, + "logits/rejected": -2.183053493499756, + "logps/chosen": -177.04518127441406, + "logps/rejected": -213.59140014648438, + "loss": 0.5864, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.235124111175537, + "rewards/margins": 0.36766180396080017, + "rewards/rejected": -1.6027857065200806, + "step": 16480 + }, + { + "epoch": 2.841144038594073, + "grad_norm": 47.61405563354492, + "learning_rate": 8.517784697425978e-10, + "logits/chosen": -2.287238121032715, + "logits/rejected": -2.2698919773101807, + "logps/chosen": -176.14376831054688, + "logps/rejected": -194.83566284179688, + "loss": 0.6508, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": -1.2334768772125244, + "rewards/margins": 0.20505890250205994, + "rewards/rejected": -1.4385356903076172, + "step": 16490 + }, + { + "epoch": 2.842866988283942, + "grad_norm": 25.528099060058594, + "learning_rate": 8.334531389159349e-10, + "logits/chosen": -2.315481185913086, + "logits/rejected": -2.267341136932373, + "logps/chosen": -170.80941772460938, + "logps/rejected": -191.43600463867188, + "loss": 0.6026, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.1156132221221924, + "rewards/margins": 0.30887675285339355, + "rewards/rejected": -1.4244900941848755, + "step": 16500 + }, + { + "epoch": 2.844589937973811, + "grad_norm": 52.03614044189453, + "learning_rate": 8.153254282801114e-10, + "logits/chosen": -2.3164989948272705, + "logits/rejected": -2.295607328414917, + "logps/chosen": -177.18972778320312, + "logps/rejected": -206.91775512695312, + "loss": 0.6078, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.194132924079895, + "rewards/margins": 0.32859188318252563, + "rewards/rejected": -1.5227246284484863, + "step": 16510 + }, + { + "epoch": 2.84631288766368, + "grad_norm": 42.001678466796875, + "learning_rate": 7.973954106976876e-10, + "logits/chosen": -2.3597311973571777, + "logits/rejected": -2.324655771255493, + "logps/chosen": -184.51565551757812, + "logps/rejected": -224.277587890625, + "loss": 0.5683, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.2740962505340576, + "rewards/margins": 0.4469670355319977, + "rewards/rejected": -1.721063256263733, + "step": 16520 + }, + { + "epoch": 2.8480358373535495, + "grad_norm": 51.226280212402344, + "learning_rate": 7.796631582366486e-10, + "logits/chosen": -2.2431588172912598, + "logits/rejected": -2.224879026412964, + "logps/chosen": -178.93814086914062, + "logps/rejected": -197.71075439453125, + "loss": 0.6314, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.221017599105835, + "rewards/margins": 0.23768672347068787, + "rewards/rejected": -1.4587042331695557, + "step": 16530 + }, + { + "epoch": 2.8497587870434185, + "grad_norm": 34.07789993286133, + "learning_rate": 7.621287421700762e-10, + "logits/chosen": -2.267864227294922, + "logits/rejected": -2.2442805767059326, + "logps/chosen": -182.6355438232422, + "logps/rejected": -209.2755126953125, + "loss": 0.6169, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.291688323020935, + "rewards/margins": 0.3040887713432312, + "rewards/rejected": -1.5957772731781006, + "step": 16540 + }, + { + "epoch": 2.8514817367332874, + "grad_norm": 27.323087692260742, + "learning_rate": 7.447922329758605e-10, + "logits/chosen": -2.252119302749634, + "logits/rejected": -2.2273201942443848, + "logps/chosen": -180.32655334472656, + "logps/rejected": -219.2308349609375, + "loss": 0.5787, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.2568960189819336, + "rewards/margins": 0.40228986740112305, + "rewards/rejected": -1.6591860055923462, + "step": 16550 + }, + { + "epoch": 2.8532046864231564, + "grad_norm": 36.010589599609375, + "learning_rate": 7.276537003364225e-10, + "logits/chosen": -2.2651255130767822, + "logits/rejected": -2.240661144256592, + "logps/chosen": -180.301025390625, + "logps/rejected": -216.3168182373047, + "loss": 0.5982, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.248245358467102, + "rewards/margins": 0.3549201488494873, + "rewards/rejected": -1.603165626525879, + "step": 16560 + }, + { + "epoch": 2.8549276361130254, + "grad_norm": 34.78297424316406, + "learning_rate": 7.107132131384475e-10, + "logits/chosen": -2.330124855041504, + "logits/rejected": -2.2960896492004395, + "logps/chosen": -176.1905975341797, + "logps/rejected": -208.61962890625, + "loss": 0.5842, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.2155747413635254, + "rewards/margins": 0.363336980342865, + "rewards/rejected": -1.5789117813110352, + "step": 16570 + }, + { + "epoch": 2.8566505858028943, + "grad_norm": 40.64534378051758, + "learning_rate": 6.939708394725907e-10, + "logits/chosen": -2.247532367706299, + "logits/rejected": -2.2236592769622803, + "logps/chosen": -176.4193572998047, + "logps/rejected": -212.16561889648438, + "loss": 0.5932, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2451902627944946, + "rewards/margins": 0.35713082551956177, + "rewards/rejected": -1.6023210287094116, + "step": 16580 + }, + { + "epoch": 2.8583735354927637, + "grad_norm": 40.58009338378906, + "learning_rate": 6.774266466331946e-10, + "logits/chosen": -2.2344393730163574, + "logits/rejected": -2.205369710922241, + "logps/chosen": -189.36399841308594, + "logps/rejected": -212.946044921875, + "loss": 0.6217, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.3462426662445068, + "rewards/margins": 0.2884506285190582, + "rewards/rejected": -1.6346931457519531, + "step": 16590 + }, + { + "epoch": 2.8600964851826327, + "grad_norm": 35.32307815551758, + "learning_rate": 6.610807011180552e-10, + "logits/chosen": -2.2636892795562744, + "logits/rejected": -2.2521121501922607, + "logps/chosen": -187.82833862304688, + "logps/rejected": -211.86575317382812, + "loss": 0.6346, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.3014774322509766, + "rewards/margins": 0.27267351746559143, + "rewards/rejected": -1.5741510391235352, + "step": 16600 + }, + { + "epoch": 2.8618194348725017, + "grad_norm": 30.843059539794922, + "learning_rate": 6.449330686281285e-10, + "logits/chosen": -2.257110118865967, + "logits/rejected": -2.216294050216675, + "logps/chosen": -186.468505859375, + "logps/rejected": -221.2821807861328, + "loss": 0.5793, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.3059688806533813, + "rewards/margins": 0.38301438093185425, + "rewards/rejected": -1.6889832019805908, + "step": 16610 + }, + { + "epoch": 2.8635423845623706, + "grad_norm": 41.82108688354492, + "learning_rate": 6.289838140672521e-10, + "logits/chosen": -2.2378902435302734, + "logits/rejected": -2.2138800621032715, + "logps/chosen": -174.65951538085938, + "logps/rejected": -203.35629272460938, + "loss": 0.6086, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.171541452407837, + "rewards/margins": 0.3206041753292084, + "rewards/rejected": -1.4921454191207886, + "step": 16620 + }, + { + "epoch": 2.86526533425224, + "grad_norm": 46.764320373535156, + "learning_rate": 6.132330015419296e-10, + "logits/chosen": -2.243197202682495, + "logits/rejected": -2.206315755844116, + "logps/chosen": -186.30581665039062, + "logps/rejected": -213.79360961914062, + "loss": 0.5886, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.2337594032287598, + "rewards/margins": 0.3845231831073761, + "rewards/rejected": -1.618282675743103, + "step": 16630 + }, + { + "epoch": 2.866988283942109, + "grad_norm": 50.13645553588867, + "learning_rate": 5.97680694361019e-10, + "logits/chosen": -2.2617459297180176, + "logits/rejected": -2.227799654006958, + "logps/chosen": -190.19491577148438, + "logps/rejected": -218.0207061767578, + "loss": 0.6145, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.3346335887908936, + "rewards/margins": 0.33890143036842346, + "rewards/rejected": -1.6735349893569946, + "step": 16640 + }, + { + "epoch": 2.868711233631978, + "grad_norm": 35.483299255371094, + "learning_rate": 5.823269550355281e-10, + "logits/chosen": -2.2349085807800293, + "logits/rejected": -2.213834047317505, + "logps/chosen": -171.34573364257812, + "logps/rejected": -202.91136169433594, + "loss": 0.5896, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.156395673751831, + "rewards/margins": 0.36401891708374023, + "rewards/rejected": -1.5204145908355713, + "step": 16650 + }, + { + "epoch": 2.870434183321847, + "grad_norm": 41.430599212646484, + "learning_rate": 5.671718452783247e-10, + "logits/chosen": -2.2703566551208496, + "logits/rejected": -2.253518581390381, + "logps/chosen": -179.66993713378906, + "logps/rejected": -211.7154541015625, + "loss": 0.6102, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.2494404315948486, + "rewards/margins": 0.33247634768486023, + "rewards/rejected": -1.5819166898727417, + "step": 16660 + }, + { + "epoch": 2.872157133011716, + "grad_norm": 31.6019229888916, + "learning_rate": 5.522154260039158e-10, + "logits/chosen": -2.2803568840026855, + "logits/rejected": -2.261735439300537, + "logps/chosen": -169.9952392578125, + "logps/rejected": -211.58175659179688, + "loss": 0.5666, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.168507695198059, + "rewards/margins": 0.4273078441619873, + "rewards/rejected": -1.5958155393600464, + "step": 16670 + }, + { + "epoch": 2.873880082701585, + "grad_norm": 47.5838623046875, + "learning_rate": 5.374577573281746e-10, + "logits/chosen": -2.2111976146698, + "logits/rejected": -2.194802761077881, + "logps/chosen": -176.59658813476562, + "logps/rejected": -202.6920166015625, + "loss": 0.6284, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2688227891921997, + "rewards/margins": 0.27188339829444885, + "rewards/rejected": -1.5407060384750366, + "step": 16680 + }, + { + "epoch": 2.8756030323914543, + "grad_norm": 35.962890625, + "learning_rate": 5.228988985681416e-10, + "logits/chosen": -2.2994630336761475, + "logits/rejected": -2.2786083221435547, + "logps/chosen": -176.1400146484375, + "logps/rejected": -202.96847534179688, + "loss": 0.6151, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.2068394422531128, + "rewards/margins": 0.2947150766849518, + "rewards/rejected": -1.5015544891357422, + "step": 16690 + }, + { + "epoch": 2.8773259820813233, + "grad_norm": 43.47890090942383, + "learning_rate": 5.085389082417291e-10, + "logits/chosen": -2.362004280090332, + "logits/rejected": -2.339451551437378, + "logps/chosen": -183.06504821777344, + "logps/rejected": -207.77261352539062, + "loss": 0.6376, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.2732914686203003, + "rewards/margins": 0.27610525488853455, + "rewards/rejected": -1.5493966341018677, + "step": 16700 + }, + { + "epoch": 2.8790489317711923, + "grad_norm": 29.09917640686035, + "learning_rate": 4.943778440675451e-10, + "logits/chosen": -2.289375066757202, + "logits/rejected": -2.261080026626587, + "logps/chosen": -176.30088806152344, + "logps/rejected": -209.833984375, + "loss": 0.5974, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.241347312927246, + "rewards/margins": 0.3593154549598694, + "rewards/rejected": -1.6006628274917603, + "step": 16710 + }, + { + "epoch": 2.8807718814610612, + "grad_norm": 48.67392349243164, + "learning_rate": 4.804157629646144e-10, + "logits/chosen": -2.251337766647339, + "logits/rejected": -2.2181668281555176, + "logps/chosen": -179.15492248535156, + "logps/rejected": -207.46963500976562, + "loss": 0.5897, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.2366948127746582, + "rewards/margins": 0.3579455316066742, + "rewards/rejected": -1.5946404933929443, + "step": 16720 + }, + { + "epoch": 2.8824948311509306, + "grad_norm": 33.78643035888672, + "learning_rate": 4.666527210521742e-10, + "logits/chosen": -2.2321724891662598, + "logits/rejected": -2.2046022415161133, + "logps/chosen": -173.66099548339844, + "logps/rejected": -217.14547729492188, + "loss": 0.5569, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.211617112159729, + "rewards/margins": 0.45391637086868286, + "rewards/rejected": -1.665533423423767, + "step": 16730 + }, + { + "epoch": 2.8842177808407996, + "grad_norm": 44.23442459106445, + "learning_rate": 4.53088773649446e-10, + "logits/chosen": -2.2257206439971924, + "logits/rejected": -2.199591636657715, + "logps/chosen": -181.7847900390625, + "logps/rejected": -212.50894165039062, + "loss": 0.6114, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.2515366077423096, + "rewards/margins": 0.3494786322116852, + "rewards/rejected": -1.601015329360962, + "step": 16740 + }, + { + "epoch": 2.8859407305306686, + "grad_norm": 38.216522216796875, + "learning_rate": 4.397239752754134e-10, + "logits/chosen": -2.3091447353363037, + "logits/rejected": -2.28430438041687, + "logps/chosen": -178.2207489013672, + "logps/rejected": -219.61129760742188, + "loss": 0.571, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.232283353805542, + "rewards/margins": 0.3908485472202301, + "rewards/rejected": -1.6231319904327393, + "step": 16750 + }, + { + "epoch": 2.8876636802205375, + "grad_norm": 42.792030334472656, + "learning_rate": 4.265583796485783e-10, + "logits/chosen": -2.2111003398895264, + "logits/rejected": -2.182976245880127, + "logps/chosen": -188.66864013671875, + "logps/rejected": -222.423583984375, + "loss": 0.5928, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.3284666538238525, + "rewards/margins": 0.3791476786136627, + "rewards/rejected": -1.707614541053772, + "step": 16760 + }, + { + "epoch": 2.8893866299104065, + "grad_norm": 57.47098159790039, + "learning_rate": 4.135920396867942e-10, + "logits/chosen": -2.3359036445617676, + "logits/rejected": -2.297128200531006, + "logps/chosen": -181.53477478027344, + "logps/rejected": -212.4428253173828, + "loss": 0.5775, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.2404382228851318, + "rewards/margins": 0.3955693244934082, + "rewards/rejected": -1.6360076665878296, + "step": 16770 + }, + { + "epoch": 2.8911095796002755, + "grad_norm": 30.25149154663086, + "learning_rate": 4.0082500750701076e-10, + "logits/chosen": -2.2654218673706055, + "logits/rejected": -2.248629331588745, + "logps/chosen": -175.7464141845703, + "logps/rejected": -214.15731811523438, + "loss": 0.5822, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.2026407718658447, + "rewards/margins": 0.36839795112609863, + "rewards/rejected": -1.571038842201233, + "step": 16780 + }, + { + "epoch": 2.892832529290145, + "grad_norm": 27.726421356201172, + "learning_rate": 3.8825733442507947e-10, + "logits/chosen": -2.284965753555298, + "logits/rejected": -2.2358572483062744, + "logps/chosen": -179.79421997070312, + "logps/rejected": -203.9884033203125, + "loss": 0.6165, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.2187730073928833, + "rewards/margins": 0.32310670614242554, + "rewards/rejected": -1.5418797731399536, + "step": 16790 + }, + { + "epoch": 2.894555478980014, + "grad_norm": 70.15962219238281, + "learning_rate": 3.75889070955554e-10, + "logits/chosen": -2.3128035068511963, + "logits/rejected": -2.2996985912323, + "logps/chosen": -179.56320190429688, + "logps/rejected": -210.6813201904297, + "loss": 0.6221, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.2907289266586304, + "rewards/margins": 0.3105553686618805, + "rewards/rejected": -1.601284384727478, + "step": 16800 + }, + { + "epoch": 2.894555478980014, + "eval_logits/chosen": -2.355128288269043, + "eval_logits/rejected": -2.343019962310791, + "eval_logps/chosen": -164.93360900878906, + "eval_logps/rejected": -186.08456420898438, + "eval_loss": 0.6498690247535706, + "eval_rewards/accuracies": 0.6040892004966736, + "eval_rewards/chosen": -1.0591812133789062, + "eval_rewards/margins": 0.17416824400424957, + "eval_rewards/rejected": -1.2333494424819946, + "eval_runtime": 384.6915, + "eval_samples_per_second": 11.188, + "eval_steps_per_second": 1.399, + "step": 16800 + }, + { + "epoch": 2.896278428669883, + "grad_norm": 34.69468688964844, + "learning_rate": 3.6372026681146806e-10, + "logits/chosen": -2.260162830352783, + "logits/rejected": -2.2307093143463135, + "logps/chosen": -168.69715881347656, + "logps/rejected": -201.69468688964844, + "loss": 0.5917, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.14708411693573, + "rewards/margins": 0.3847166895866394, + "rewards/rejected": -1.531800627708435, + "step": 16810 + }, + { + "epoch": 2.898001378359752, + "grad_norm": 34.2429084777832, + "learning_rate": 3.517509709041688e-10, + "logits/chosen": -2.2920870780944824, + "logits/rejected": -2.263136625289917, + "logps/chosen": -183.0124969482422, + "logps/rejected": -202.57705688476562, + "loss": 0.6253, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.2682347297668457, + "rewards/margins": 0.2431812733411789, + "rewards/rejected": -1.5114161968231201, + "step": 16820 + }, + { + "epoch": 2.899724328049621, + "grad_norm": 46.141563415527344, + "learning_rate": 3.399812313430728e-10, + "logits/chosen": -2.3478941917419434, + "logits/rejected": -2.3191308975219727, + "logps/chosen": -192.22817993164062, + "logps/rejected": -230.08999633789062, + "loss": 0.5932, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.3660786151885986, + "rewards/margins": 0.3936581015586853, + "rewards/rejected": -1.7597367763519287, + "step": 16830 + }, + { + "epoch": 2.90144727773949, + "grad_norm": 37.20547866821289, + "learning_rate": 3.284110954355157e-10, + "logits/chosen": -2.2482810020446777, + "logits/rejected": -2.215100049972534, + "logps/chosen": -176.05003356933594, + "logps/rejected": -210.0874786376953, + "loss": 0.5841, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.2200827598571777, + "rewards/margins": 0.3753504753112793, + "rewards/rejected": -1.5954333543777466, + "step": 16840 + }, + { + "epoch": 2.903170227429359, + "grad_norm": 29.959806442260742, + "learning_rate": 3.1704060968654746e-10, + "logits/chosen": -2.29280161857605, + "logits/rejected": -2.2742457389831543, + "logps/chosen": -191.05917358398438, + "logps/rejected": -208.31741333007812, + "loss": 0.6377, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.345529317855835, + "rewards/margins": 0.24640360474586487, + "rewards/rejected": -1.5919328927993774, + "step": 16850 + }, + { + "epoch": 2.904893177119228, + "grad_norm": 40.781044006347656, + "learning_rate": 3.0586981979873747e-10, + "logits/chosen": -2.2989661693573, + "logits/rejected": -2.260462522506714, + "logps/chosen": -183.75123596191406, + "logps/rejected": -206.5366668701172, + "loss": 0.627, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.2734284400939941, + "rewards/margins": 0.291414350271225, + "rewards/rejected": -1.564842939376831, + "step": 16860 + }, + { + "epoch": 2.906616126809097, + "grad_norm": 44.73786544799805, + "learning_rate": 2.9489877067199185e-10, + "logits/chosen": -2.3094325065612793, + "logits/rejected": -2.2826006412506104, + "logps/chosen": -179.9336395263672, + "logps/rejected": -205.8349609375, + "loss": 0.6229, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.2519553899765015, + "rewards/margins": 0.3051519989967346, + "rewards/rejected": -1.5571073293685913, + "step": 16870 + }, + { + "epoch": 2.908339076498966, + "grad_norm": 49.504329681396484, + "learning_rate": 2.8412750640338654e-10, + "logits/chosen": -2.326951265335083, + "logits/rejected": -2.300234079360962, + "logps/chosen": -173.57034301757812, + "logps/rejected": -217.76400756835938, + "loss": 0.5652, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.231156587600708, + "rewards/margins": 0.4336206316947937, + "rewards/rejected": -1.664777159690857, + "step": 16880 + }, + { + "epoch": 2.910062026188835, + "grad_norm": 44.85762023925781, + "learning_rate": 2.7355607028698437e-10, + "logits/chosen": -2.258765935897827, + "logits/rejected": -2.242250919342041, + "logps/chosen": -179.78518676757812, + "logps/rejected": -208.6576385498047, + "loss": 0.6117, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.2435871362686157, + "rewards/margins": 0.31127530336380005, + "rewards/rejected": -1.5548624992370605, + "step": 16890 + }, + { + "epoch": 2.9117849758787044, + "grad_norm": 39.28889083862305, + "learning_rate": 2.6318450481365164e-10, + "logits/chosen": -2.3123373985290527, + "logits/rejected": -2.2920501232147217, + "logps/chosen": -174.97561645507812, + "logps/rejected": -210.5087890625, + "loss": 0.5735, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.2110536098480225, + "rewards/margins": 0.40395593643188477, + "rewards/rejected": -1.6150096654891968, + "step": 16900 + }, + { + "epoch": 2.9135079255685734, + "grad_norm": 30.90997314453125, + "learning_rate": 2.5301285167088624e-10, + "logits/chosen": -2.3353562355041504, + "logits/rejected": -2.313014507293701, + "logps/chosen": -180.96817016601562, + "logps/rejected": -211.5290985107422, + "loss": 0.6066, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.2850770950317383, + "rewards/margins": 0.2966782748699188, + "rewards/rejected": -1.5817553997039795, + "step": 16910 + }, + { + "epoch": 2.9152308752584424, + "grad_norm": 32.51064682006836, + "learning_rate": 2.430411517426734e-10, + "logits/chosen": -2.281481981277466, + "logits/rejected": -2.2618112564086914, + "logps/chosen": -176.6731719970703, + "logps/rejected": -202.71807861328125, + "loss": 0.6197, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.2086846828460693, + "rewards/margins": 0.2883586585521698, + "rewards/rejected": -1.497043490409851, + "step": 16920 + }, + { + "epoch": 2.9169538249483113, + "grad_norm": 32.26247787475586, + "learning_rate": 2.332694451092965e-10, + "logits/chosen": -2.324796676635742, + "logits/rejected": -2.3038277626037598, + "logps/chosen": -171.7364959716797, + "logps/rejected": -195.21363830566406, + "loss": 0.6096, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.1428351402282715, + "rewards/margins": 0.30366069078445435, + "rewards/rejected": -1.446495771408081, + "step": 16930 + }, + { + "epoch": 2.9186767746381808, + "grad_norm": 40.67167282104492, + "learning_rate": 2.2369777104718768e-10, + "logits/chosen": -2.2941434383392334, + "logits/rejected": -2.272467613220215, + "logps/chosen": -178.9009246826172, + "logps/rejected": -210.223876953125, + "loss": 0.5812, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.1943846940994263, + "rewards/margins": 0.3680204451084137, + "rewards/rejected": -1.5624051094055176, + "step": 16940 + }, + { + "epoch": 2.9203997243280497, + "grad_norm": 37.32505416870117, + "learning_rate": 2.143261680287667e-10, + "logits/chosen": -2.2652194499969482, + "logits/rejected": -2.2491161823272705, + "logps/chosen": -168.2947540283203, + "logps/rejected": -213.27236938476562, + "loss": 0.5776, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.1722462177276611, + "rewards/margins": 0.417537122964859, + "rewards/rejected": -1.5897831916809082, + "step": 16950 + }, + { + "epoch": 2.9221226740179187, + "grad_norm": 36.005714416503906, + "learning_rate": 2.051546737222909e-10, + "logits/chosen": -2.263373851776123, + "logits/rejected": -2.240389347076416, + "logps/chosen": -183.56320190429688, + "logps/rejected": -221.6046600341797, + "loss": 0.5857, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.275015950202942, + "rewards/margins": 0.39710181951522827, + "rewards/rejected": -1.6721179485321045, + "step": 16960 + }, + { + "epoch": 2.9238456237077877, + "grad_norm": 35.368194580078125, + "learning_rate": 1.9618332499169442e-10, + "logits/chosen": -2.3448400497436523, + "logits/rejected": -2.311893939971924, + "logps/chosen": -179.88156127929688, + "logps/rejected": -215.29086303710938, + "loss": 0.5784, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.2340666055679321, + "rewards/margins": 0.39124569296836853, + "rewards/rejected": -1.6253122091293335, + "step": 16970 + }, + { + "epoch": 2.9255685733976566, + "grad_norm": 29.743484497070312, + "learning_rate": 1.8741215789644936e-10, + "logits/chosen": -2.3190419673919678, + "logits/rejected": -2.297215700149536, + "logps/chosen": -174.7429656982422, + "logps/rejected": -202.0531463623047, + "loss": 0.6004, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.1987437009811401, + "rewards/margins": 0.3031441569328308, + "rewards/rejected": -1.5018876791000366, + "step": 16980 + }, + { + "epoch": 2.9272915230875256, + "grad_norm": 31.945310592651367, + "learning_rate": 1.7884120769141032e-10, + "logits/chosen": -2.3586273193359375, + "logits/rejected": -2.3274989128112793, + "logps/chosen": -165.72108459472656, + "logps/rejected": -206.23239135742188, + "loss": 0.5635, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.1223981380462646, + "rewards/margins": 0.4226682782173157, + "rewards/rejected": -1.545066237449646, + "step": 16990 + }, + { + "epoch": 2.929014472777395, + "grad_norm": 40.08998489379883, + "learning_rate": 1.7047050882669223e-10, + "logits/chosen": -2.355996608734131, + "logits/rejected": -2.332170009613037, + "logps/chosen": -175.21693420410156, + "logps/rejected": -202.94973754882812, + "loss": 0.6114, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.1910465955734253, + "rewards/margins": 0.30955350399017334, + "rewards/rejected": -1.5006000995635986, + "step": 17000 + }, + { + "epoch": 2.930737422467264, + "grad_norm": 28.414857864379883, + "learning_rate": 1.623000949475095e-10, + "logits/chosen": -2.266655445098877, + "logits/rejected": -2.245577096939087, + "logps/chosen": -188.32498168945312, + "logps/rejected": -222.77828979492188, + "loss": 0.6018, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.345150351524353, + "rewards/margins": 0.33940568566322327, + "rewards/rejected": -1.684556245803833, + "step": 17010 + }, + { + "epoch": 2.932460372157133, + "grad_norm": 46.6025276184082, + "learning_rate": 1.5432999889404274e-10, + "logits/chosen": -2.264037847518921, + "logits/rejected": -2.262972593307495, + "logps/chosen": -181.83615112304688, + "logps/rejected": -202.73126220703125, + "loss": 0.6667, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.2782509326934814, + "rewards/margins": 0.22515583038330078, + "rewards/rejected": -1.5034067630767822, + "step": 17020 + }, + { + "epoch": 2.934183321847002, + "grad_norm": 42.2696647644043, + "learning_rate": 1.4656025270133876e-10, + "logits/chosen": -2.3105950355529785, + "logits/rejected": -2.2809667587280273, + "logps/chosen": -182.8857421875, + "logps/rejected": -206.82546997070312, + "loss": 0.6195, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.2540470361709595, + "rewards/margins": 0.2889506220817566, + "rewards/rejected": -1.5429977178573608, + "step": 17030 + }, + { + "epoch": 2.9359062715368713, + "grad_norm": 35.74762725830078, + "learning_rate": 1.3899088759913302e-10, + "logits/chosen": -2.341332197189331, + "logits/rejected": -2.3059799671173096, + "logps/chosen": -180.80311584472656, + "logps/rejected": -213.46328735351562, + "loss": 0.5792, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.2436562776565552, + "rewards/margins": 0.38733428716659546, + "rewards/rejected": -1.6309906244277954, + "step": 17040 + }, + { + "epoch": 2.9376292212267403, + "grad_norm": 40.4786262512207, + "learning_rate": 1.316219340117608e-10, + "logits/chosen": -2.2711122035980225, + "logits/rejected": -2.2497386932373047, + "logps/chosen": -182.53744506835938, + "logps/rejected": -209.8964385986328, + "loss": 0.6232, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.2701737880706787, + "rewards/margins": 0.2986606955528259, + "rewards/rejected": -1.5688344240188599, + "step": 17050 + }, + { + "epoch": 2.9393521709166093, + "grad_norm": 33.46916961669922, + "learning_rate": 1.2445342155801842e-10, + "logits/chosen": -2.325920581817627, + "logits/rejected": -2.294126272201538, + "logps/chosen": -183.05918884277344, + "logps/rejected": -211.62661743164062, + "loss": 0.629, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.2804826498031616, + "rewards/margins": 0.29142576456069946, + "rewards/rejected": -1.5719083547592163, + "step": 17060 + }, + { + "epoch": 2.9410751206064782, + "grad_norm": 50.899085998535156, + "learning_rate": 1.1748537905105217e-10, + "logits/chosen": -2.289062976837158, + "logits/rejected": -2.2559592723846436, + "logps/chosen": -185.1095428466797, + "logps/rejected": -210.09213256835938, + "loss": 0.6258, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.290991187095642, + "rewards/margins": 0.2987528145313263, + "rewards/rejected": -1.5897438526153564, + "step": 17070 + }, + { + "epoch": 2.942798070296347, + "grad_norm": 33.59029769897461, + "learning_rate": 1.1071783449823624e-10, + "logits/chosen": -2.2389519214630127, + "logits/rejected": -2.20768666267395, + "logps/chosen": -176.70346069335938, + "logps/rejected": -223.4357147216797, + "loss": 0.5514, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.2281625270843506, + "rewards/margins": 0.4805348515510559, + "rewards/rejected": -1.7086973190307617, + "step": 17080 + }, + { + "epoch": 2.944521019986216, + "grad_norm": 31.26056671142578, + "learning_rate": 1.0415081510106172e-10, + "logits/chosen": -2.288973331451416, + "logits/rejected": -2.2687900066375732, + "logps/chosen": -177.4471893310547, + "logps/rejected": -215.8143310546875, + "loss": 0.5844, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.2379207611083984, + "rewards/margins": 0.3961821496486664, + "rewards/rejected": -1.6341030597686768, + "step": 17090 + }, + { + "epoch": 2.9462439696760856, + "grad_norm": 34.02079772949219, + "learning_rate": 9.778434725503105e-11, + "logits/chosen": -2.2463839054107666, + "logits/rejected": -2.2128617763519287, + "logps/chosen": -183.8589630126953, + "logps/rejected": -215.9231719970703, + "loss": 0.5856, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2714749574661255, + "rewards/margins": 0.3676893413066864, + "rewards/rejected": -1.6391643285751343, + "step": 17100 + }, + { + "epoch": 2.9479669193659546, + "grad_norm": 32.88734817504883, + "learning_rate": 9.161845654954703e-11, + "logits/chosen": -2.2507004737854004, + "logits/rejected": -2.238379716873169, + "logps/chosen": -195.37942504882812, + "logps/rejected": -225.2381591796875, + "loss": 0.6197, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.3950939178466797, + "rewards/margins": 0.32841694355010986, + "rewards/rejected": -1.723510980606079, + "step": 17110 + }, + { + "epoch": 2.9496898690558235, + "grad_norm": 37.77426528930664, + "learning_rate": 8.565316776780739e-11, + "logits/chosen": -2.3413896560668945, + "logits/rejected": -2.30137300491333, + "logps/chosen": -176.1731414794922, + "logps/rejected": -216.9041748046875, + "loss": 0.5323, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.2267433404922485, + "rewards/margins": 0.46331986784935, + "rewards/rejected": -1.690063238143921, + "step": 17120 + }, + { + "epoch": 2.9514128187456925, + "grad_norm": 40.9581413269043, + "learning_rate": 7.988850488672705e-11, + "logits/chosen": -2.2292721271514893, + "logits/rejected": -2.1956279277801514, + "logps/chosen": -183.31048583984375, + "logps/rejected": -217.1750030517578, + "loss": 0.5832, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.2756314277648926, + "rewards/margins": 0.36559030413627625, + "rewards/rejected": -1.6412217617034912, + "step": 17130 + }, + { + "epoch": 2.953135768435562, + "grad_norm": 32.19672775268555, + "learning_rate": 7.432449107679928e-11, + "logits/chosen": -2.234701633453369, + "logits/rejected": -2.2178635597229004, + "logps/chosen": -174.7184295654297, + "logps/rejected": -208.18563842773438, + "loss": 0.5853, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.1894357204437256, + "rewards/margins": 0.3720587193965912, + "rewards/rejected": -1.5614944696426392, + "step": 17140 + }, + { + "epoch": 2.954858718125431, + "grad_norm": 42.60933303833008, + "learning_rate": 6.896114870204583e-11, + "logits/chosen": -2.339540958404541, + "logits/rejected": -2.3109090328216553, + "logps/chosen": -184.4068603515625, + "logps/rejected": -211.8623504638672, + "loss": 0.6216, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.2877216339111328, + "rewards/margins": 0.31411486864089966, + "rewards/rejected": -1.6018365621566772, + "step": 17150 + }, + { + "epoch": 2.9565816678153, + "grad_norm": 29.023967742919922, + "learning_rate": 6.379849931990034e-11, + "logits/chosen": -2.338467597961426, + "logits/rejected": -2.3185648918151855, + "logps/chosen": -172.90652465820312, + "logps/rejected": -225.8275604248047, + "loss": 0.5377, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.2059366703033447, + "rewards/margins": 0.539142370223999, + "rewards/rejected": -1.7450790405273438, + "step": 17160 + }, + { + "epoch": 2.958304617505169, + "grad_norm": 29.838197708129883, + "learning_rate": 5.883656368114164e-11, + "logits/chosen": -2.3640499114990234, + "logits/rejected": -2.339353561401367, + "logps/chosen": -194.65211486816406, + "logps/rejected": -218.7022705078125, + "loss": 0.6556, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": -1.3461074829101562, + "rewards/margins": 0.27067288756370544, + "rewards/rejected": -1.6167805194854736, + "step": 17170 + }, + { + "epoch": 2.960027567195038, + "grad_norm": 39.5936279296875, + "learning_rate": 5.407536172978844e-11, + "logits/chosen": -2.311352491378784, + "logits/rejected": -2.289877414703369, + "logps/chosen": -179.19358825683594, + "logps/rejected": -213.298583984375, + "loss": 0.605, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.2765612602233887, + "rewards/margins": 0.34954506158828735, + "rewards/rejected": -1.6261062622070312, + "step": 17180 + }, + { + "epoch": 2.9617505168849068, + "grad_norm": 37.97072219848633, + "learning_rate": 4.951491260302698e-11, + "logits/chosen": -2.2549374103546143, + "logits/rejected": -2.238830327987671, + "logps/chosen": -174.09535217285156, + "logps/rejected": -204.37635803222656, + "loss": 0.606, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.2118583917617798, + "rewards/margins": 0.31206992268562317, + "rewards/rejected": -1.523928165435791, + "step": 17190 + }, + { + "epoch": 2.963473466574776, + "grad_norm": 59.260013580322266, + "learning_rate": 4.515523463115012e-11, + "logits/chosen": -2.312384605407715, + "logits/rejected": -2.298529624938965, + "logps/chosen": -172.6897430419922, + "logps/rejected": -202.76080322265625, + "loss": 0.6096, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.1744720935821533, + "rewards/margins": 0.3171740174293518, + "rewards/rejected": -1.4916460514068604, + "step": 17200 + }, + { + "epoch": 2.963473466574776, + "eval_logits/chosen": -2.3549132347106934, + "eval_logits/rejected": -2.342874765396118, + "eval_logps/chosen": -164.96139526367188, + "eval_logps/rejected": -186.09048461914062, + "eval_loss": 0.650018572807312, + "eval_rewards/accuracies": 0.6045538783073425, + "eval_rewards/chosen": -1.0594593286514282, + "eval_rewards/margins": 0.1739494800567627, + "eval_rewards/rejected": -1.2334089279174805, + "eval_runtime": 384.5491, + "eval_samples_per_second": 11.192, + "eval_steps_per_second": 1.399, + "step": 17200 + }, + { + "epoch": 2.965196416264645, + "grad_norm": 41.53389358520508, + "learning_rate": 4.099634533745733e-11, + "logits/chosen": -2.2500195503234863, + "logits/rejected": -2.231739044189453, + "logps/chosen": -182.5985107421875, + "logps/rejected": -217.8794708251953, + "loss": 0.6055, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.2984097003936768, + "rewards/margins": 0.3526950180530548, + "rewards/rejected": -1.6511045694351196, + "step": 17210 + }, + { + "epoch": 2.966919365954514, + "grad_norm": 27.601787567138672, + "learning_rate": 3.7038261438204765e-11, + "logits/chosen": -2.289471387863159, + "logits/rejected": -2.269460678100586, + "logps/chosen": -182.10348510742188, + "logps/rejected": -233.2564239501953, + "loss": 0.5604, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2569857835769653, + "rewards/margins": 0.51573246717453, + "rewards/rejected": -1.7727181911468506, + "step": 17220 + }, + { + "epoch": 2.968642315644383, + "grad_norm": 33.00059127807617, + "learning_rate": 3.3280998842527554e-11, + "logits/chosen": -2.2976396083831787, + "logits/rejected": -2.2868614196777344, + "logps/chosen": -178.9959259033203, + "logps/rejected": -215.16806030273438, + "loss": 0.6023, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.2521908283233643, + "rewards/margins": 0.3561028242111206, + "rewards/rejected": -1.6082935333251953, + "step": 17230 + }, + { + "epoch": 2.9703652653342525, + "grad_norm": 34.8737907409668, + "learning_rate": 2.972457265237871e-11, + "logits/chosen": -2.2938859462738037, + "logits/rejected": -2.2738187313079834, + "logps/chosen": -169.763427734375, + "logps/rejected": -201.87765502929688, + "loss": 0.5938, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.1499814987182617, + "rewards/margins": 0.326345831155777, + "rewards/rejected": -1.4763273000717163, + "step": 17240 + }, + { + "epoch": 2.9720882150241215, + "grad_norm": 35.13932800292969, + "learning_rate": 2.6368997162479202e-11, + "logits/chosen": -2.272411823272705, + "logits/rejected": -2.2502591609954834, + "logps/chosen": -176.018310546875, + "logps/rejected": -217.7426300048828, + "loss": 0.5582, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.2261368036270142, + "rewards/margins": 0.4333557188510895, + "rewards/rejected": -1.6594922542572021, + "step": 17250 + }, + { + "epoch": 2.9738111647139904, + "grad_norm": 42.27689743041992, + "learning_rate": 2.321428586022911e-11, + "logits/chosen": -2.2786245346069336, + "logits/rejected": -2.2477970123291016, + "logps/chosen": -174.90060424804688, + "logps/rejected": -213.00875854492188, + "loss": 0.567, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.1894375085830688, + "rewards/margins": 0.4287734031677246, + "rewards/rejected": -1.618210792541504, + "step": 17260 + }, + { + "epoch": 2.9755341144038594, + "grad_norm": 40.46282958984375, + "learning_rate": 2.0260451425690994e-11, + "logits/chosen": -2.2892489433288574, + "logits/rejected": -2.260471820831299, + "logps/chosen": -185.9062042236328, + "logps/rejected": -212.2075653076172, + "loss": 0.6475, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.2983613014221191, + "rewards/margins": 0.28522801399230957, + "rewards/rejected": -1.5835893154144287, + "step": 17270 + }, + { + "epoch": 2.9772570640937284, + "grad_norm": 32.98335647583008, + "learning_rate": 1.7507505731523266e-11, + "logits/chosen": -2.2438535690307617, + "logits/rejected": -2.2073891162872314, + "logps/chosen": -176.59791564941406, + "logps/rejected": -214.287109375, + "loss": 0.5699, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.2137374877929688, + "rewards/margins": 0.4162743091583252, + "rewards/rejected": -1.6300119161605835, + "step": 17280 + }, + { + "epoch": 2.9789800137835973, + "grad_norm": 29.91464614868164, + "learning_rate": 1.4955459842913576e-11, + "logits/chosen": -2.3043110370635986, + "logits/rejected": -2.2792234420776367, + "logps/chosen": -183.82244873046875, + "logps/rejected": -214.74813842773438, + "loss": 0.5871, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.2343107461929321, + "rewards/margins": 0.3532152771949768, + "rewards/rejected": -1.5875260829925537, + "step": 17290 + }, + { + "epoch": 2.9807029634734663, + "grad_norm": 56.498451232910156, + "learning_rate": 1.2604324017573276e-11, + "logits/chosen": -2.3131518363952637, + "logits/rejected": -2.297375440597534, + "logps/chosen": -183.20748901367188, + "logps/rejected": -206.3766632080078, + "loss": 0.6271, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.265207052230835, + "rewards/margins": 0.28281646966934204, + "rewards/rejected": -1.5480234622955322, + "step": 17300 + }, + { + "epoch": 2.9824259131633357, + "grad_norm": 45.164241790771484, + "learning_rate": 1.0454107705665238e-11, + "logits/chosen": -2.3076233863830566, + "logits/rejected": -2.2815582752227783, + "logps/chosen": -180.4513397216797, + "logps/rejected": -215.8129119873047, + "loss": 0.582, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.2481348514556885, + "rewards/margins": 0.3904893100261688, + "rewards/rejected": -1.6386241912841797, + "step": 17310 + }, + { + "epoch": 2.9841488628532047, + "grad_norm": 39.80954360961914, + "learning_rate": 8.504819549770559e-12, + "logits/chosen": -2.3062615394592285, + "logits/rejected": -2.284247398376465, + "logps/chosen": -181.01290893554688, + "logps/rejected": -204.4337921142578, + "loss": 0.6206, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.2549407482147217, + "rewards/margins": 0.27550196647644043, + "rewards/rejected": -1.5304428339004517, + "step": 17320 + }, + { + "epoch": 2.9858718125430737, + "grad_norm": 45.717803955078125, + "learning_rate": 6.7564673848719e-12, + "logits/chosen": -2.255821943283081, + "logits/rejected": -2.2316794395446777, + "logps/chosen": -180.52059936523438, + "logps/rejected": -203.00363159179688, + "loss": 0.6294, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.237790822982788, + "rewards/margins": 0.28286388516426086, + "rewards/rejected": -1.5206546783447266, + "step": 17330 + }, + { + "epoch": 2.987594762232943, + "grad_norm": 26.13838768005371, + "learning_rate": 5.2090582382924295e-12, + "logits/chosen": -2.31354022026062, + "logits/rejected": -2.2810709476470947, + "logps/chosen": -174.29434204101562, + "logps/rejected": -211.77145385742188, + "loss": 0.5811, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.2048596143722534, + "rewards/margins": 0.3996252119541168, + "rewards/rejected": -1.6044849157333374, + "step": 17340 + }, + { + "epoch": 2.989317711922812, + "grad_norm": 34.028167724609375, + "learning_rate": 3.8625983297069234e-12, + "logits/chosen": -2.320079803466797, + "logits/rejected": -2.2873146533966064, + "logps/chosen": -171.4598846435547, + "logps/rejected": -200.25634765625, + "loss": 0.597, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.1712242364883423, + "rewards/margins": 0.32024580240249634, + "rewards/rejected": -1.4914699792861938, + "step": 17350 + }, + { + "epoch": 2.991040661612681, + "grad_norm": 43.01903533935547, + "learning_rate": 2.7170930710695983e-12, + "logits/chosen": -2.3500120639801025, + "logits/rejected": -2.3285441398620605, + "logps/chosen": -177.98741149902344, + "logps/rejected": -217.0200653076172, + "loss": 0.589, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.227912187576294, + "rewards/margins": 0.400569349527359, + "rewards/rejected": -1.6284816265106201, + "step": 17360 + }, + { + "epoch": 2.99276361130255, + "grad_norm": 38.99733352661133, + "learning_rate": 1.7725470666363208e-12, + "logits/chosen": -2.285942792892456, + "logits/rejected": -2.2686188220977783, + "logps/chosen": -180.58023071289062, + "logps/rejected": -207.6968231201172, + "loss": 0.6286, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.2742855548858643, + "rewards/margins": 0.26797252893447876, + "rewards/rejected": -1.5422580242156982, + "step": 17370 + }, + { + "epoch": 2.994486560992419, + "grad_norm": 54.05454635620117, + "learning_rate": 1.0289641129146431e-12, + "logits/chosen": -2.268444776535034, + "logits/rejected": -2.254495620727539, + "logps/chosen": -182.00112915039062, + "logps/rejected": -209.5324249267578, + "loss": 0.6173, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.284103512763977, + "rewards/margins": 0.27302271127700806, + "rewards/rejected": -1.5571261644363403, + "step": 17380 + }, + { + "epoch": 2.996209510682288, + "grad_norm": 41.577232360839844, + "learning_rate": 4.863471986693568e-13, + "logits/chosen": -2.2996535301208496, + "logits/rejected": -2.279010057449341, + "logps/chosen": -180.03443908691406, + "logps/rejected": -210.28857421875, + "loss": 0.6006, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.2460159063339233, + "rewards/margins": 0.3277485966682434, + "rewards/rejected": -1.5737645626068115, + "step": 17390 + }, + { + "epoch": 2.997932460372157, + "grad_norm": 30.558481216430664, + "learning_rate": 1.4469850488918467e-13, + "logits/chosen": -2.2963578701019287, + "logits/rejected": -2.2678630352020264, + "logps/chosen": -181.9180450439453, + "logps/rejected": -216.5481719970703, + "loss": 0.585, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.2563611268997192, + "rewards/margins": 0.4204491078853607, + "rewards/rejected": -1.6768100261688232, + "step": 17400 + }, + { + "epoch": 2.9996554100620263, + "grad_norm": 52.21319580078125, + "learning_rate": 4.019404797883652e-15, + "logits/chosen": -2.2540431022644043, + "logits/rejected": -2.228407621383667, + "logps/chosen": -172.69688415527344, + "logps/rejected": -216.9202880859375, + "loss": 0.5487, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.1783651113510132, + "rewards/margins": 0.4475262761116028, + "rewards/rejected": -1.6258913278579712, + "step": 17410 + }, + { + "epoch": 3.0, + "step": 17412, + "total_flos": 0.0, + "train_loss": 0.6300815686244623, + "train_runtime": 86182.1351, + "train_samples_per_second": 3.232, + "train_steps_per_second": 0.202 + } + ], + "logging_steps": 10, + "max_steps": 17412, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 400, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}