{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 37, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "debug/policy_chosen_logits": -0.8575258255004883, "debug/policy_chosen_logps": -145.75564575195312, "debug/policy_rejected_logits": -0.7912808060646057, "debug/policy_rejected_logps": -155.6038818359375, "debug/reference_chosen_logps": -145.75564575195312, "debug/reference_rejected_logps": -155.6038818359375, "epoch": 0.02702702702702703, "grad_norm": 10.590267262290878, "learning_rate": 1e-06, "logits/chosen": -0.8575258255004883, "logits/rejected": -0.7912808060646057, "logps/chosen": -145.75564575195312, "logps/rejected": -155.6038818359375, "loss": 0.5, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "debug/policy_chosen_logits": -0.7469339966773987, "debug/policy_chosen_logps": -161.53433227539062, "debug/policy_rejected_logits": -0.7776355147361755, "debug/policy_rejected_logps": -170.75521850585938, "debug/reference_chosen_logps": -161.53152465820312, "debug/reference_rejected_logps": -170.4493408203125, "epoch": 0.05405405405405406, "grad_norm": 10.000972104848039, "learning_rate": 1e-06, "logits/chosen": -0.7469339966773987, "logits/rejected": -0.7776355147361755, "logps/chosen": -161.53433227539062, "logps/rejected": -170.75521850585938, "loss": 0.4999, "rewards/accuracies": 0.875, "rewards/chosen": -2.8095149900764227e-05, "rewards/margins": 0.0030306337866932154, "rewards/rejected": -0.0030587289948016405, "step": 2 }, { "debug/policy_chosen_logits": -0.8981359004974365, "debug/policy_chosen_logps": -176.5548095703125, "debug/policy_rejected_logits": -0.8165794014930725, "debug/policy_rejected_logps": -174.51219177246094, "debug/reference_chosen_logps": -176.45700073242188, "debug/reference_rejected_logps": -174.48855590820312, "epoch": 0.08108108108108109, "grad_norm": 11.901363041409127, "learning_rate": 1e-06, "logits/chosen": -0.8981359004974365, "logits/rejected": -0.8165794014930725, "logps/chosen": -176.5548095703125, "logps/rejected": -174.51219177246094, "loss": 0.4991, "rewards/accuracies": 0.5, "rewards/chosen": -0.0009781360859051347, "rewards/margins": -0.000741882249712944, "rewards/rejected": -0.00023625371977686882, "step": 3 }, { "debug/policy_chosen_logits": -0.9096183776855469, "debug/policy_chosen_logps": -166.55648803710938, "debug/policy_rejected_logits": -0.9066538214683533, "debug/policy_rejected_logps": -164.07363891601562, "debug/reference_chosen_logps": -166.54132080078125, "debug/reference_rejected_logps": -164.5047607421875, "epoch": 0.10810810810810811, "grad_norm": 11.906594144254457, "learning_rate": 1e-06, "logits/chosen": -0.9096183776855469, "logits/rejected": -0.9066538214683533, "logps/chosen": -166.55648803710938, "logps/rejected": -164.07363891601562, "loss": 0.4985, "rewards/accuracies": 0.5, "rewards/chosen": -0.00015165336662903428, "rewards/margins": -0.004462923854589462, "rewards/rejected": 0.004311270546168089, "step": 4 }, { "debug/policy_chosen_logits": -0.7255207896232605, "debug/policy_chosen_logps": -197.11146545410156, "debug/policy_rejected_logits": -0.6460945010185242, "debug/policy_rejected_logps": -199.1307830810547, "debug/reference_chosen_logps": -196.3956298828125, "debug/reference_rejected_logps": -198.6553192138672, "epoch": 0.13513513513513514, "grad_norm": 12.022558068888802, "learning_rate": 1e-06, "logits/chosen": -0.7255207896232605, "logits/rejected": -0.6460945010185242, "logps/chosen": -197.11146545410156, "logps/rejected": -199.1307830810547, "loss": 0.4968, "rewards/accuracies": 0.375, "rewards/chosen": -0.007158536929637194, "rewards/margins": -0.00240384042263031, "rewards/rejected": -0.004754695575684309, "step": 5 }, { "debug/policy_chosen_logits": -0.9493230581283569, "debug/policy_chosen_logps": -129.1173553466797, "debug/policy_rejected_logits": -0.8156340718269348, "debug/policy_rejected_logps": -164.30892944335938, "debug/reference_chosen_logps": -127.74217987060547, "debug/reference_rejected_logps": -162.74935913085938, "epoch": 0.16216216216216217, "grad_norm": 12.824448923481688, "learning_rate": 1e-06, "logits/chosen": -0.9493230581283569, "logits/rejected": -0.8156340718269348, "logps/chosen": -129.1173553466797, "logps/rejected": -164.30892944335938, "loss": 0.5049, "rewards/accuracies": 0.625, "rewards/chosen": -0.013751697726547718, "rewards/margins": 0.0018438897095620632, "rewards/rejected": -0.015595588833093643, "step": 6 }, { "debug/policy_chosen_logits": -0.7272260785102844, "debug/policy_chosen_logps": -184.9543914794922, "debug/policy_rejected_logits": -0.6780456900596619, "debug/policy_rejected_logps": -187.98831176757812, "debug/reference_chosen_logps": -184.20895385742188, "debug/reference_rejected_logps": -188.1205291748047, "epoch": 0.1891891891891892, "grad_norm": 12.347680907504191, "learning_rate": 1e-06, "logits/chosen": -0.7272260785102844, "logits/rejected": -0.6780456900596619, "logps/chosen": -184.9543914794922, "logps/rejected": -187.98831176757812, "loss": 0.4986, "rewards/accuracies": 0.375, "rewards/chosen": -0.007454394828528166, "rewards/margins": -0.008776684291660786, "rewards/rejected": 0.0013222885318100452, "step": 7 }, { "debug/policy_chosen_logits": -0.8364517688751221, "debug/policy_chosen_logps": -189.10472106933594, "debug/policy_rejected_logits": -0.7381677627563477, "debug/policy_rejected_logps": -188.10775756835938, "debug/reference_chosen_logps": -187.6288299560547, "debug/reference_rejected_logps": -188.3019256591797, "epoch": 0.21621621621621623, "grad_norm": 12.404458206833132, "learning_rate": 1e-06, "logits/chosen": -0.8364517688751221, "logits/rejected": -0.7381677627563477, "logps/chosen": -189.10472106933594, "logps/rejected": -188.10775756835938, "loss": 0.4937, "rewards/accuracies": 0.375, "rewards/chosen": -0.014758807606995106, "rewards/margins": -0.016700536012649536, "rewards/rejected": 0.0019417284056544304, "step": 8 }, { "debug/policy_chosen_logits": -0.9120941758155823, "debug/policy_chosen_logps": -163.34133911132812, "debug/policy_rejected_logits": -0.976243257522583, "debug/policy_rejected_logps": -167.19322204589844, "debug/reference_chosen_logps": -161.68980407714844, "debug/reference_rejected_logps": -164.67279052734375, "epoch": 0.24324324324324326, "grad_norm": 12.727135168469278, "learning_rate": 1e-06, "logits/chosen": -0.9120941758155823, "logits/rejected": -0.976243257522583, "logps/chosen": -163.34133911132812, "logps/rejected": -167.19322204589844, "loss": 0.4976, "rewards/accuracies": 0.5, "rewards/chosen": -0.0165153406560421, "rewards/margins": 0.008688842877745628, "rewards/rejected": -0.025204181671142578, "step": 9 }, { "debug/policy_chosen_logits": -0.760608434677124, "debug/policy_chosen_logps": -172.08404541015625, "debug/policy_rejected_logits": -0.7949759364128113, "debug/policy_rejected_logps": -178.0618133544922, "debug/reference_chosen_logps": -171.69947814941406, "debug/reference_rejected_logps": -175.53970336914062, "epoch": 0.2702702702702703, "grad_norm": 12.198398917852398, "learning_rate": 1e-06, "logits/chosen": -0.760608434677124, "logits/rejected": -0.7949759364128113, "logps/chosen": -172.08404541015625, "logps/rejected": -178.0618133544922, "loss": 0.49, "rewards/accuracies": 0.875, "rewards/chosen": -0.003845730097964406, "rewards/margins": 0.021375417709350586, "rewards/rejected": -0.025221146643161774, "step": 10 }, { "debug/policy_chosen_logits": -0.7727924585342407, "debug/policy_chosen_logps": -174.89260864257812, "debug/policy_rejected_logits": -0.8414075970649719, "debug/policy_rejected_logps": -191.7485809326172, "debug/reference_chosen_logps": -174.308349609375, "debug/reference_rejected_logps": -189.44308471679688, "epoch": 0.2972972972972973, "grad_norm": 12.869944279488237, "learning_rate": 1e-06, "logits/chosen": -0.7727924585342407, "logits/rejected": -0.8414075970649719, "logps/chosen": -174.89260864257812, "logps/rejected": -191.7485809326172, "loss": 0.4961, "rewards/accuracies": 0.75, "rewards/chosen": -0.005842561833560467, "rewards/margins": 0.0172123983502388, "rewards/rejected": -0.02305496111512184, "step": 11 }, { "debug/policy_chosen_logits": -0.9232946038246155, "debug/policy_chosen_logps": -172.6126708984375, "debug/policy_rejected_logits": -0.8749657273292542, "debug/policy_rejected_logps": -176.91111755371094, "debug/reference_chosen_logps": -170.90977478027344, "debug/reference_rejected_logps": -176.2742156982422, "epoch": 0.32432432432432434, "grad_norm": 13.637899725211922, "learning_rate": 1e-06, "logits/chosen": -0.9232946038246155, "logits/rejected": -0.8749657273292542, "logps/chosen": -172.6126708984375, "logps/rejected": -176.91111755371094, "loss": 0.5005, "rewards/accuracies": 0.625, "rewards/chosen": -0.017028970643877983, "rewards/margins": -0.010659895837306976, "rewards/rejected": -0.006369075272232294, "step": 12 }, { "debug/policy_chosen_logits": -0.7241289615631104, "debug/policy_chosen_logps": -170.33370971679688, "debug/policy_rejected_logits": -0.7508945465087891, "debug/policy_rejected_logps": -174.8129119873047, "debug/reference_chosen_logps": -169.210205078125, "debug/reference_rejected_logps": -175.19602966308594, "epoch": 0.35135135135135137, "grad_norm": 12.678047953411022, "learning_rate": 1e-06, "logits/chosen": -0.7241289615631104, "logits/rejected": -0.7508945465087891, "logps/chosen": -170.33370971679688, "logps/rejected": -174.8129119873047, "loss": 0.4867, "rewards/accuracies": 0.125, "rewards/chosen": -0.011235074140131474, "rewards/margins": -0.015066290274262428, "rewards/rejected": 0.0038312142714858055, "step": 13 }, { "debug/policy_chosen_logits": -0.9043007493019104, "debug/policy_chosen_logps": -165.7388916015625, "debug/policy_rejected_logits": -0.8851659893989563, "debug/policy_rejected_logps": -185.2967529296875, "debug/reference_chosen_logps": -169.01324462890625, "debug/reference_rejected_logps": -184.21755981445312, "epoch": 0.3783783783783784, "grad_norm": 14.228799593284224, "learning_rate": 1e-06, "logits/chosen": -0.9043007493019104, "logits/rejected": -0.8851659893989563, "logps/chosen": -165.7388916015625, "logps/rejected": -185.2967529296875, "loss": 0.4838, "rewards/accuracies": 0.875, "rewards/chosen": 0.032743629068136215, "rewards/margins": 0.04353557527065277, "rewards/rejected": -0.010791949927806854, "step": 14 }, { "debug/policy_chosen_logits": -0.8794234991073608, "debug/policy_chosen_logps": -190.51846313476562, "debug/policy_rejected_logits": -0.9416622519493103, "debug/policy_rejected_logps": -182.75303649902344, "debug/reference_chosen_logps": -191.22064208984375, "debug/reference_rejected_logps": -180.401611328125, "epoch": 0.40540540540540543, "grad_norm": 13.867700853582742, "learning_rate": 1e-06, "logits/chosen": -0.8794234991073608, "logits/rejected": -0.9416622519493103, "logps/chosen": -190.51846313476562, "logps/rejected": -182.75303649902344, "loss": 0.5002, "rewards/accuracies": 0.875, "rewards/chosen": 0.0070218658074736595, "rewards/margins": 0.03053615428507328, "rewards/rejected": -0.023514289408922195, "step": 15 }, { "debug/policy_chosen_logits": -0.9163352251052856, "debug/policy_chosen_logps": -161.2454833984375, "debug/policy_rejected_logits": -0.8506691455841064, "debug/policy_rejected_logps": -174.76438903808594, "debug/reference_chosen_logps": -164.75534057617188, "debug/reference_rejected_logps": -172.7283172607422, "epoch": 0.43243243243243246, "grad_norm": 15.950956103697601, "learning_rate": 1e-06, "logits/chosen": -0.9163352251052856, "logits/rejected": -0.8506691455841064, "logps/chosen": -161.2454833984375, "logps/rejected": -174.76438903808594, "loss": 0.4953, "rewards/accuracies": 0.75, "rewards/chosen": 0.035098638385534286, "rewards/margins": 0.055459294468164444, "rewards/rejected": -0.020360659807920456, "step": 16 }, { "debug/policy_chosen_logits": -0.886226236820221, "debug/policy_chosen_logps": -152.4619598388672, "debug/policy_rejected_logits": -0.8994572162628174, "debug/policy_rejected_logps": -186.78121948242188, "debug/reference_chosen_logps": -153.10696411132812, "debug/reference_rejected_logps": -188.9115447998047, "epoch": 0.4594594594594595, "grad_norm": 14.555594379538805, "learning_rate": 1e-06, "logits/chosen": -0.886226236820221, "logits/rejected": -0.8994572162628174, "logps/chosen": -152.4619598388672, "logps/rejected": -186.78121948242188, "loss": 0.4997, "rewards/accuracies": 0.375, "rewards/chosen": 0.006449948064982891, "rewards/margins": -0.014853332191705704, "rewards/rejected": 0.02130328118801117, "step": 17 }, { "debug/policy_chosen_logits": -0.791405200958252, "debug/policy_chosen_logps": -184.97439575195312, "debug/policy_rejected_logits": -0.7767256498336792, "debug/policy_rejected_logps": -168.33358764648438, "debug/reference_chosen_logps": -185.62191772460938, "debug/reference_rejected_logps": -161.90869140625, "epoch": 0.4864864864864865, "grad_norm": 17.299703458305174, "learning_rate": 1e-06, "logits/chosen": -0.791405200958252, "logits/rejected": -0.7767256498336792, "logps/chosen": -184.97439575195312, "logps/rejected": -168.33358764648438, "loss": 0.4931, "rewards/accuracies": 0.75, "rewards/chosen": 0.006475199945271015, "rewards/margins": 0.0707239881157875, "rewards/rejected": -0.06424878537654877, "step": 18 }, { "debug/policy_chosen_logits": -0.8800061345100403, "debug/policy_chosen_logps": -148.43475341796875, "debug/policy_rejected_logits": -0.9134210348129272, "debug/policy_rejected_logps": -142.7111358642578, "debug/reference_chosen_logps": -149.74551391601562, "debug/reference_rejected_logps": -140.6129150390625, "epoch": 0.5135135135135135, "grad_norm": 15.389494933483391, "learning_rate": 1e-06, "logits/chosen": -0.8800061345100403, "logits/rejected": -0.9134210348129272, "logps/chosen": -148.43475341796875, "logps/rejected": -142.7111358642578, "loss": 0.4953, "rewards/accuracies": 0.75, "rewards/chosen": 0.013107641600072384, "rewards/margins": 0.03408981114625931, "rewards/rejected": -0.02098216861486435, "step": 19 }, { "debug/policy_chosen_logits": -0.95091712474823, "debug/policy_chosen_logps": -152.49203491210938, "debug/policy_rejected_logits": -0.8285200595855713, "debug/policy_rejected_logps": -152.87184143066406, "debug/reference_chosen_logps": -151.83273315429688, "debug/reference_rejected_logps": -152.75054931640625, "epoch": 0.5405405405405406, "grad_norm": 13.903822325437991, "learning_rate": 1e-06, "logits/chosen": -0.95091712474823, "logits/rejected": -0.8285200595855713, "logps/chosen": -152.49203491210938, "logps/rejected": -152.87184143066406, "loss": 0.5003, "rewards/accuracies": 0.5, "rewards/chosen": -0.006593028549104929, "rewards/margins": -0.005380069836974144, "rewards/rejected": -0.0012129591777920723, "step": 20 }, { "debug/policy_chosen_logits": -0.8724645972251892, "debug/policy_chosen_logps": -142.1439666748047, "debug/policy_rejected_logits": -0.7575433850288391, "debug/policy_rejected_logps": -175.90309143066406, "debug/reference_chosen_logps": -140.2176513671875, "debug/reference_rejected_logps": -174.95127868652344, "epoch": 0.5675675675675675, "grad_norm": 14.060737834168805, "learning_rate": 1e-06, "logits/chosen": -0.8724645972251892, "logits/rejected": -0.7575433850288391, "logps/chosen": -142.1439666748047, "logps/rejected": -175.90309143066406, "loss": 0.4882, "rewards/accuracies": 0.5, "rewards/chosen": -0.01926323026418686, "rewards/margins": -0.00974507350474596, "rewards/rejected": -0.009518155828118324, "step": 21 }, { "debug/policy_chosen_logits": -1.0412858724594116, "debug/policy_chosen_logps": -135.55929565429688, "debug/policy_rejected_logits": -1.00175142288208, "debug/policy_rejected_logps": -188.136962890625, "debug/reference_chosen_logps": -135.7158203125, "debug/reference_rejected_logps": -186.81320190429688, "epoch": 0.5945945945945946, "grad_norm": 14.183550914621547, "learning_rate": 1e-06, "logits/chosen": -1.0412858724594116, "logits/rejected": -1.00175142288208, "logps/chosen": -135.55929565429688, "logps/rejected": -188.136962890625, "loss": 0.4911, "rewards/accuracies": 0.625, "rewards/chosen": 0.0015652086585760117, "rewards/margins": 0.014802752062678337, "rewards/rejected": -0.013237543404102325, "step": 22 }, { "debug/policy_chosen_logits": -0.8892878293991089, "debug/policy_chosen_logps": -152.1015625, "debug/policy_rejected_logits": -0.7090870141983032, "debug/policy_rejected_logps": -186.6959686279297, "debug/reference_chosen_logps": -147.5272216796875, "debug/reference_rejected_logps": -184.30201721191406, "epoch": 0.6216216216216216, "grad_norm": 14.276125809311413, "learning_rate": 1e-06, "logits/chosen": -0.8892878293991089, "logits/rejected": -0.7090870141983032, "logps/chosen": -152.1015625, "logps/rejected": -186.6959686279297, "loss": 0.5004, "rewards/accuracies": 0.5, "rewards/chosen": -0.04574331268668175, "rewards/margins": -0.021803725510835648, "rewards/rejected": -0.0239395871758461, "step": 23 }, { "debug/policy_chosen_logits": -0.8802600502967834, "debug/policy_chosen_logps": -159.21139526367188, "debug/policy_rejected_logits": -0.9364652037620544, "debug/policy_rejected_logps": -164.08383178710938, "debug/reference_chosen_logps": -155.6630859375, "debug/reference_rejected_logps": -161.748291015625, "epoch": 0.6486486486486487, "grad_norm": 15.603022958174197, "learning_rate": 1e-06, "logits/chosen": -0.8802600502967834, "logits/rejected": -0.9364652037620544, "logps/chosen": -159.21139526367188, "logps/rejected": -164.08383178710938, "loss": 0.4987, "rewards/accuracies": 0.375, "rewards/chosen": -0.035483140498399734, "rewards/margins": -0.012127798981964588, "rewards/rejected": -0.02335534058511257, "step": 24 }, { "debug/policy_chosen_logits": -0.9097840785980225, "debug/policy_chosen_logps": -161.55003356933594, "debug/policy_rejected_logits": -0.8685249090194702, "debug/policy_rejected_logps": -179.54681396484375, "debug/reference_chosen_logps": -162.00999450683594, "debug/reference_rejected_logps": -181.08807373046875, "epoch": 0.6756756756756757, "grad_norm": 14.622466903613363, "learning_rate": 1e-06, "logits/chosen": -0.9097840785980225, "logits/rejected": -0.8685249090194702, "logps/chosen": -161.55003356933594, "logps/rejected": -179.54681396484375, "loss": 0.5001, "rewards/accuracies": 0.5, "rewards/chosen": 0.004599475301802158, "rewards/margins": -0.010813076049089432, "rewards/rejected": 0.015412550419569016, "step": 25 }, { "debug/policy_chosen_logits": -0.8982373476028442, "debug/policy_chosen_logps": -155.50433349609375, "debug/policy_rejected_logits": -0.8736176490783691, "debug/policy_rejected_logps": -176.95404052734375, "debug/reference_chosen_logps": -155.57803344726562, "debug/reference_rejected_logps": -176.65240478515625, "epoch": 0.7027027027027027, "grad_norm": 14.148910548336614, "learning_rate": 1e-06, "logits/chosen": -0.8982373476028442, "logits/rejected": -0.8736176490783691, "logps/chosen": -155.50433349609375, "logps/rejected": -176.95404052734375, "loss": 0.5059, "rewards/accuracies": 0.625, "rewards/chosen": 0.0007370477542281151, "rewards/margins": 0.003753413911908865, "rewards/rejected": -0.003016366856172681, "step": 26 }, { "debug/policy_chosen_logits": -0.9832875728607178, "debug/policy_chosen_logps": -159.749267578125, "debug/policy_rejected_logits": -0.9118414521217346, "debug/policy_rejected_logps": -151.69076538085938, "debug/reference_chosen_logps": -162.98480224609375, "debug/reference_rejected_logps": -154.2532501220703, "epoch": 0.7297297297297297, "grad_norm": 15.17612475288469, "learning_rate": 1e-06, "logits/chosen": -0.9832875728607178, "logits/rejected": -0.9118414521217346, "logps/chosen": -159.749267578125, "logps/rejected": -151.69076538085938, "loss": 0.5099, "rewards/accuracies": 0.375, "rewards/chosen": 0.03235547989606857, "rewards/margins": 0.006730623543262482, "rewards/rejected": 0.02562485635280609, "step": 27 }, { "debug/policy_chosen_logits": -0.8673918843269348, "debug/policy_chosen_logps": -147.43087768554688, "debug/policy_rejected_logits": -0.7803842425346375, "debug/policy_rejected_logps": -188.6826629638672, "debug/reference_chosen_logps": -148.56741333007812, "debug/reference_rejected_logps": -186.77401733398438, "epoch": 0.7567567567567568, "grad_norm": 13.774005492067339, "learning_rate": 1e-06, "logits/chosen": -0.8673918843269348, "logits/rejected": -0.7803842425346375, "logps/chosen": -147.43087768554688, "logps/rejected": -188.6826629638672, "loss": 0.4971, "rewards/accuracies": 0.5, "rewards/chosen": 0.01136524323374033, "rewards/margins": 0.030451610684394836, "rewards/rejected": -0.01908636838197708, "step": 28 }, { "debug/policy_chosen_logits": -0.8602281808853149, "debug/policy_chosen_logps": -153.77182006835938, "debug/policy_rejected_logits": -0.8865491151809692, "debug/policy_rejected_logps": -184.97218322753906, "debug/reference_chosen_logps": -154.79705810546875, "debug/reference_rejected_logps": -183.60633850097656, "epoch": 0.7837837837837838, "grad_norm": 15.257213384975808, "learning_rate": 1e-06, "logits/chosen": -0.8602281808853149, "logits/rejected": -0.8865491151809692, "logps/chosen": -153.77182006835938, "logps/rejected": -184.97218322753906, "loss": 0.5138, "rewards/accuracies": 0.625, "rewards/chosen": 0.01025250181555748, "rewards/margins": 0.02391086146235466, "rewards/rejected": -0.01365836150944233, "step": 29 }, { "debug/policy_chosen_logits": -0.9129707217216492, "debug/policy_chosen_logps": -184.99423217773438, "debug/policy_rejected_logits": -1.0919617414474487, "debug/policy_rejected_logps": -141.35853576660156, "debug/reference_chosen_logps": -188.018798828125, "debug/reference_rejected_logps": -136.7606201171875, "epoch": 0.8108108108108109, "grad_norm": 13.614612194522516, "learning_rate": 1e-06, "logits/chosen": -0.9129707217216492, "logits/rejected": -1.0919617414474487, "logps/chosen": -184.99423217773438, "logps/rejected": -141.35853576660156, "loss": 0.4793, "rewards/accuracies": 0.75, "rewards/chosen": 0.030245695263147354, "rewards/margins": 0.07622484117746353, "rewards/rejected": -0.04597914591431618, "step": 30 }, { "debug/policy_chosen_logits": -0.7870268225669861, "debug/policy_chosen_logps": -157.5154266357422, "debug/policy_rejected_logits": -0.8374965190887451, "debug/policy_rejected_logps": -162.17874145507812, "debug/reference_chosen_logps": -159.80447387695312, "debug/reference_rejected_logps": -163.95941162109375, "epoch": 0.8378378378378378, "grad_norm": 13.069040617397022, "learning_rate": 1e-06, "logits/chosen": -0.7870268225669861, "logits/rejected": -0.8374965190887451, "logps/chosen": -157.5154266357422, "logps/rejected": -162.17874145507812, "loss": 0.48, "rewards/accuracies": 0.625, "rewards/chosen": 0.022890347987413406, "rewards/margins": 0.0050835697911679745, "rewards/rejected": 0.017806777730584145, "step": 31 }, { "debug/policy_chosen_logits": -0.8162547945976257, "debug/policy_chosen_logps": -171.15731811523438, "debug/policy_rejected_logits": -0.8792607188224792, "debug/policy_rejected_logps": -170.6051483154297, "debug/reference_chosen_logps": -174.41046142578125, "debug/reference_rejected_logps": -171.5959930419922, "epoch": 0.8648648648648649, "grad_norm": 13.409519517707912, "learning_rate": 1e-06, "logits/chosen": -0.8162547945976257, "logits/rejected": -0.8792607188224792, "logps/chosen": -171.15731811523438, "logps/rejected": -170.6051483154297, "loss": 0.4797, "rewards/accuracies": 0.75, "rewards/chosen": 0.03253144025802612, "rewards/margins": 0.02262299507856369, "rewards/rejected": 0.009908447973430157, "step": 32 }, { "debug/policy_chosen_logits": -0.8389750719070435, "debug/policy_chosen_logps": -130.0693359375, "debug/policy_rejected_logits": -0.8284645676612854, "debug/policy_rejected_logps": -172.84576416015625, "debug/reference_chosen_logps": -128.82723999023438, "debug/reference_rejected_logps": -173.13807678222656, "epoch": 0.8918918918918919, "grad_norm": 14.49702969131408, "learning_rate": 1e-06, "logits/chosen": -0.8389750719070435, "logits/rejected": -0.8284645676612854, "logps/chosen": -130.0693359375, "logps/rejected": -172.84576416015625, "loss": 0.5019, "rewards/accuracies": 0.25, "rewards/chosen": -0.01242092065513134, "rewards/margins": -0.01534400973469019, "rewards/rejected": 0.0029230881482362747, "step": 33 }, { "debug/policy_chosen_logits": -0.7313972115516663, "debug/policy_chosen_logps": -206.6120147705078, "debug/policy_rejected_logits": -0.8341861367225647, "debug/policy_rejected_logps": -174.07289123535156, "debug/reference_chosen_logps": -204.1991729736328, "debug/reference_rejected_logps": -172.9614715576172, "epoch": 0.918918918918919, "grad_norm": 13.848425223482339, "learning_rate": 1e-06, "logits/chosen": -0.7313972115516663, "logits/rejected": -0.8341861367225647, "logps/chosen": -206.6120147705078, "logps/rejected": -174.07289123535156, "loss": 0.4918, "rewards/accuracies": 0.375, "rewards/chosen": -0.024128342047333717, "rewards/margins": -0.013014238327741623, "rewards/rejected": -0.01111410278826952, "step": 34 }, { "debug/policy_chosen_logits": -0.7511980533599854, "debug/policy_chosen_logps": -169.01324462890625, "debug/policy_rejected_logits": -0.8619469404220581, "debug/policy_rejected_logps": -155.92356872558594, "debug/reference_chosen_logps": -174.2823028564453, "debug/reference_rejected_logps": -151.06353759765625, "epoch": 0.9459459459459459, "grad_norm": 13.804422393772251, "learning_rate": 1e-06, "logits/chosen": -0.7511980533599854, "logits/rejected": -0.8619469404220581, "logps/chosen": -169.01324462890625, "logps/rejected": -155.92356872558594, "loss": 0.491, "rewards/accuracies": 0.875, "rewards/chosen": 0.052690617740154266, "rewards/margins": 0.10129091143608093, "rewards/rejected": -0.04860030114650726, "step": 35 }, { "debug/policy_chosen_logits": -0.9176344275474548, "debug/policy_chosen_logps": -142.12106323242188, "debug/policy_rejected_logits": -0.8687180876731873, "debug/policy_rejected_logps": -179.96939086914062, "debug/reference_chosen_logps": -143.38665771484375, "debug/reference_rejected_logps": -179.182861328125, "epoch": 0.972972972972973, "grad_norm": 12.092026182468153, "learning_rate": 1e-06, "logits/chosen": -0.9176344275474548, "logits/rejected": -0.8687180876731873, "logps/chosen": -142.12106323242188, "logps/rejected": -179.96939086914062, "loss": 0.4918, "rewards/accuracies": 0.625, "rewards/chosen": 0.012656106613576412, "rewards/margins": 0.02052140235900879, "rewards/rejected": -0.007865296676754951, "step": 36 }, { "debug/policy_chosen_logits": -0.8388864398002625, "debug/policy_chosen_logps": -190.0416717529297, "debug/policy_rejected_logits": -0.9188035726547241, "debug/policy_rejected_logps": -160.5929412841797, "debug/reference_chosen_logps": -190.74612426757812, "debug/reference_rejected_logps": -159.9268798828125, "epoch": 1.0, "grad_norm": 12.215368730436227, "learning_rate": 1e-06, "logits/chosen": -0.8388864398002625, "logits/rejected": -0.9188035726547241, "logps/chosen": -190.0416717529297, "logps/rejected": -160.5929412841797, "loss": 0.4773, "rewards/accuracies": 0.625, "rewards/chosen": 0.00704436469823122, "rewards/margins": 0.013704795390367508, "rewards/rejected": -0.006660431623458862, "step": 37 }, { "epoch": 1.0, "step": 37, "total_flos": 0.0, "train_loss": 0.4953597542401907, "train_runtime": 139.3771, "train_samples_per_second": 16.897, "train_steps_per_second": 0.265 } ], "logging_steps": 1, "max_steps": 37, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }