{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 41, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "debug/policy_chosen_logits": -1.0657888650894165, "debug/policy_chosen_logps": -135.17166137695312, "debug/policy_rejected_logits": -1.0082604885101318, "debug/policy_rejected_logps": -164.67491149902344, "debug/reference_chosen_logps": -135.17166137695312, "debug/reference_rejected_logps": -164.67491149902344, "epoch": 0.024390243902439025, "grad_norm": 7.455705642028753, "learning_rate": 1e-06, "logits/chosen": -1.0657888650894165, "logits/rejected": -1.0082604885101318, "logps/chosen": -135.17166137695312, "logps/rejected": -164.67491149902344, "loss": 0.5, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "debug/policy_chosen_logits": -0.9316875338554382, "debug/policy_chosen_logps": -168.0986785888672, "debug/policy_rejected_logits": -0.9154044389724731, "debug/policy_rejected_logps": -152.3076934814453, "debug/reference_chosen_logps": -168.41038513183594, "debug/reference_rejected_logps": -152.78643798828125, "epoch": 0.04878048780487805, "grad_norm": 7.29758593491183, "learning_rate": 1e-06, "logits/chosen": -0.9316875338554382, "logits/rejected": -0.9154044389724731, "logps/chosen": -168.0986785888672, "logps/rejected": -152.3076934814453, "loss": 0.4997, "rewards/accuracies": 0.375, "rewards/chosen": 0.003117017913609743, "rewards/margins": -0.001670417725108564, "rewards/rejected": 0.004787435755133629, "step": 2 }, { "debug/policy_chosen_logits": -1.1543760299682617, "debug/policy_chosen_logps": -162.00730895996094, "debug/policy_rejected_logits": -0.9926377534866333, "debug/policy_rejected_logps": -148.77609252929688, "debug/reference_chosen_logps": -161.8112030029297, "debug/reference_rejected_logps": -148.79986572265625, "epoch": 0.07317073170731707, "grad_norm": 7.862425079439745, "learning_rate": 1e-06, "logits/chosen": -1.1543760299682617, "logits/rejected": -0.9926377534866333, "logps/chosen": -162.00730895996094, "logps/rejected": -148.77609252929688, "loss": 0.4992, "rewards/accuracies": 0.5, "rewards/chosen": -0.0019610594026744366, "rewards/margins": -0.0021987915970385075, "rewards/rejected": 0.00023773196153342724, "step": 3 }, { "debug/policy_chosen_logits": -0.9400157332420349, "debug/policy_chosen_logps": -156.6750030517578, "debug/policy_rejected_logits": -1.0246046781539917, "debug/policy_rejected_logps": -140.80905151367188, "debug/reference_chosen_logps": -156.9263153076172, "debug/reference_rejected_logps": -141.17587280273438, "epoch": 0.0975609756097561, "grad_norm": 8.32764259608944, "learning_rate": 1e-06, "logits/chosen": -0.9400157332420349, "logits/rejected": -1.0246046781539917, "logps/chosen": -156.6750030517578, "logps/rejected": -140.80905151367188, "loss": 0.4991, "rewards/accuracies": 0.5, "rewards/chosen": 0.0025131413713097572, "rewards/margins": -0.0011551285861060023, "rewards/rejected": 0.0036682700738310814, "step": 4 }, { "debug/policy_chosen_logits": -0.8135665059089661, "debug/policy_chosen_logps": -180.49978637695312, "debug/policy_rejected_logits": -0.966259241104126, "debug/policy_rejected_logps": -180.0769500732422, "debug/reference_chosen_logps": -180.33763122558594, "debug/reference_rejected_logps": -180.13702392578125, "epoch": 0.12195121951219512, "grad_norm": 9.5687290716109, "learning_rate": 1e-06, "logits/chosen": -0.8135665059089661, "logits/rejected": -0.966259241104126, "logps/chosen": -180.49978637695312, "logps/rejected": -180.0769500732422, "loss": 0.5012, "rewards/accuracies": 0.5, "rewards/chosen": -0.0016215991927310824, "rewards/margins": -0.002222432754933834, "rewards/rejected": 0.0006008340278640389, "step": 5 }, { "debug/policy_chosen_logits": -0.875511109828949, "debug/policy_chosen_logps": -169.20863342285156, "debug/policy_rejected_logits": -0.8739838600158691, "debug/policy_rejected_logps": -161.75338745117188, "debug/reference_chosen_logps": -168.50558471679688, "debug/reference_rejected_logps": -160.78341674804688, "epoch": 0.14634146341463414, "grad_norm": 8.253308110293828, "learning_rate": 1e-06, "logits/chosen": -0.875511109828949, "logits/rejected": -0.8739838600158691, "logps/chosen": -169.20863342285156, "logps/rejected": -161.75338745117188, "loss": 0.5028, "rewards/accuracies": 0.625, "rewards/chosen": -0.007030477747321129, "rewards/margins": 0.0026691434904932976, "rewards/rejected": -0.009699621237814426, "step": 6 }, { "debug/policy_chosen_logits": -0.9064666628837585, "debug/policy_chosen_logps": -163.56883239746094, "debug/policy_rejected_logits": -0.88660728931427, "debug/policy_rejected_logps": -167.95022583007812, "debug/reference_chosen_logps": -162.90780639648438, "debug/reference_rejected_logps": -166.8964080810547, "epoch": 0.17073170731707318, "grad_norm": 9.148150186683857, "learning_rate": 1e-06, "logits/chosen": -0.9064666628837585, "logits/rejected": -0.88660728931427, "logps/chosen": -163.56883239746094, "logps/rejected": -167.95022583007812, "loss": 0.5021, "rewards/accuracies": 0.625, "rewards/chosen": -0.0066101741977036, "rewards/margins": 0.0039279647171497345, "rewards/rejected": -0.010538139380514622, "step": 7 }, { "debug/policy_chosen_logits": -1.0406209230422974, "debug/policy_chosen_logps": -145.71910095214844, "debug/policy_rejected_logits": -0.9474231004714966, "debug/policy_rejected_logps": -176.6788330078125, "debug/reference_chosen_logps": -145.38890075683594, "debug/reference_rejected_logps": -176.26510620117188, "epoch": 0.1951219512195122, "grad_norm": 7.873026337514701, "learning_rate": 1e-06, "logits/chosen": -1.0406209230422974, "logits/rejected": -0.9474231004714966, "logps/chosen": -145.71910095214844, "logps/rejected": -176.6788330078125, "loss": 0.4983, "rewards/accuracies": 0.375, "rewards/chosen": -0.003301925491541624, "rewards/margins": 0.0008353232406079769, "rewards/rejected": -0.004137249197810888, "step": 8 }, { "debug/policy_chosen_logits": -0.7725314497947693, "debug/policy_chosen_logps": -191.3946533203125, "debug/policy_rejected_logits": -0.9446115493774414, "debug/policy_rejected_logps": -157.00643920898438, "debug/reference_chosen_logps": -191.58938598632812, "debug/reference_rejected_logps": -157.259521484375, "epoch": 0.21951219512195122, "grad_norm": 8.113917191223486, "learning_rate": 1e-06, "logits/chosen": -0.7725314497947693, "logits/rejected": -0.9446115493774414, "logps/chosen": -191.3946533203125, "logps/rejected": -157.00643920898438, "loss": 0.4984, "rewards/accuracies": 0.625, "rewards/chosen": 0.0019471454434096813, "rewards/margins": -0.0005835914053022861, "rewards/rejected": 0.00253073638305068, "step": 9 }, { "debug/policy_chosen_logits": -0.9539018273353577, "debug/policy_chosen_logps": -156.33084106445312, "debug/policy_rejected_logits": -0.8661950826644897, "debug/policy_rejected_logps": -166.2339630126953, "debug/reference_chosen_logps": -156.4985809326172, "debug/reference_rejected_logps": -165.98721313476562, "epoch": 0.24390243902439024, "grad_norm": 8.579040246656625, "learning_rate": 1e-06, "logits/chosen": -0.9539018273353577, "logits/rejected": -0.8661950826644897, "logps/chosen": -156.33084106445312, "logps/rejected": -166.2339630126953, "loss": 0.502, "rewards/accuracies": 0.75, "rewards/chosen": 0.001677446300163865, "rewards/margins": 0.0041448017582297325, "rewards/rejected": -0.002467355690896511, "step": 10 }, { "debug/policy_chosen_logits": -0.9540179371833801, "debug/policy_chosen_logps": -182.48159790039062, "debug/policy_rejected_logits": -0.9883304834365845, "debug/policy_rejected_logps": -147.31826782226562, "debug/reference_chosen_logps": -184.1331024169922, "debug/reference_rejected_logps": -149.22958374023438, "epoch": 0.2682926829268293, "grad_norm": 8.658564154949419, "learning_rate": 1e-06, "logits/chosen": -0.9540179371833801, "logits/rejected": -0.9883304834365845, "logps/chosen": -182.48159790039062, "logps/rejected": -147.31826782226562, "loss": 0.4978, "rewards/accuracies": 0.375, "rewards/chosen": 0.016515053808689117, "rewards/margins": -0.0025980568025261164, "rewards/rejected": 0.01911311037838459, "step": 11 }, { "debug/policy_chosen_logits": -0.9231241345405579, "debug/policy_chosen_logps": -173.3875274658203, "debug/policy_rejected_logits": -0.9746717810630798, "debug/policy_rejected_logps": -186.23223876953125, "debug/reference_chosen_logps": -173.79617309570312, "debug/reference_rejected_logps": -185.99221801757812, "epoch": 0.2926829268292683, "grad_norm": 9.641106516800372, "learning_rate": 1e-06, "logits/chosen": -0.9231241345405579, "logits/rejected": -0.9746717810630798, "logps/chosen": -173.3875274658203, "logps/rejected": -186.23223876953125, "loss": 0.4994, "rewards/accuracies": 0.75, "rewards/chosen": 0.0040865708142519, "rewards/margins": 0.006486768834292889, "rewards/rejected": -0.0024001982528716326, "step": 12 }, { "debug/policy_chosen_logits": -0.8564068675041199, "debug/policy_chosen_logps": -155.58351135253906, "debug/policy_rejected_logits": -0.78647381067276, "debug/policy_rejected_logps": -180.34735107421875, "debug/reference_chosen_logps": -155.46743774414062, "debug/reference_rejected_logps": -179.4715576171875, "epoch": 0.3170731707317073, "grad_norm": 7.778587819967809, "learning_rate": 1e-06, "logits/chosen": -0.8564068675041199, "logits/rejected": -0.78647381067276, "logps/chosen": -155.58351135253906, "logps/rejected": -180.34735107421875, "loss": 0.4997, "rewards/accuracies": 0.5, "rewards/chosen": -0.0011608502827584743, "rewards/margins": 0.007597027346491814, "rewards/rejected": -0.008757877163589, "step": 13 }, { "debug/policy_chosen_logits": -0.9291447997093201, "debug/policy_chosen_logps": -137.7774658203125, "debug/policy_rejected_logits": -0.8161742687225342, "debug/policy_rejected_logps": -154.59149169921875, "debug/reference_chosen_logps": -136.43026733398438, "debug/reference_rejected_logps": -152.96876525878906, "epoch": 0.34146341463414637, "grad_norm": 9.452087496130199, "learning_rate": 1e-06, "logits/chosen": -0.9291447997093201, "logits/rejected": -0.8161742687225342, "logps/chosen": -137.7774658203125, "logps/rejected": -154.59149169921875, "loss": 0.4991, "rewards/accuracies": 0.5, "rewards/chosen": -0.013471984304487705, "rewards/margins": 0.002755412831902504, "rewards/rejected": -0.016227398067712784, "step": 14 }, { "debug/policy_chosen_logits": -0.8452920317649841, "debug/policy_chosen_logps": -175.22425842285156, "debug/policy_rejected_logits": -0.7951302528381348, "debug/policy_rejected_logps": -175.16729736328125, "debug/reference_chosen_logps": -174.3819122314453, "debug/reference_rejected_logps": -172.59144592285156, "epoch": 0.36585365853658536, "grad_norm": 8.577281209186165, "learning_rate": 1e-06, "logits/chosen": -0.8452920317649841, "logits/rejected": -0.7951302528381348, "logps/chosen": -175.22425842285156, "logps/rejected": -175.16729736328125, "loss": 0.4999, "rewards/accuracies": 0.75, "rewards/chosen": -0.008423404768109322, "rewards/margins": 0.017334945499897003, "rewards/rejected": -0.025758352130651474, "step": 15 }, { "debug/policy_chosen_logits": -0.8255157470703125, "debug/policy_chosen_logps": -196.6995391845703, "debug/policy_rejected_logits": -1.0168753862380981, "debug/policy_rejected_logps": -144.28012084960938, "debug/reference_chosen_logps": -194.49815368652344, "debug/reference_rejected_logps": -141.6895751953125, "epoch": 0.3902439024390244, "grad_norm": 10.569580209063629, "learning_rate": 1e-06, "logits/chosen": -0.8255157470703125, "logits/rejected": -1.0168753862380981, "logps/chosen": -196.6995391845703, "logps/rejected": -144.28012084960938, "loss": 0.5055, "rewards/accuracies": 0.5, "rewards/chosen": -0.02201385423541069, "rewards/margins": 0.003891811240464449, "rewards/rejected": -0.025905665010213852, "step": 16 }, { "debug/policy_chosen_logits": -0.8963515162467957, "debug/policy_chosen_logps": -167.45254516601562, "debug/policy_rejected_logits": -1.1090606451034546, "debug/policy_rejected_logps": -132.12088012695312, "debug/reference_chosen_logps": -165.4241943359375, "debug/reference_rejected_logps": -131.6002655029297, "epoch": 0.4146341463414634, "grad_norm": 8.216685819750042, "learning_rate": 1e-06, "logits/chosen": -0.8963515162467957, "logits/rejected": -1.1090606451034546, "logps/chosen": -167.45254516601562, "logps/rejected": -132.12088012695312, "loss": 0.5023, "rewards/accuracies": 0.375, "rewards/chosen": -0.020283488556742668, "rewards/margins": -0.015077323652803898, "rewards/rejected": -0.00520616490393877, "step": 17 }, { "debug/policy_chosen_logits": -1.1278187036514282, "debug/policy_chosen_logps": -163.56838989257812, "debug/policy_rejected_logits": -0.7523326277732849, "debug/policy_rejected_logps": -194.7676239013672, "debug/reference_chosen_logps": -164.0474395751953, "debug/reference_rejected_logps": -195.21795654296875, "epoch": 0.43902439024390244, "grad_norm": 8.12497169240311, "learning_rate": 1e-06, "logits/chosen": -1.1278187036514282, "logits/rejected": -0.7523326277732849, "logps/chosen": -163.56838989257812, "logps/rejected": -194.7676239013672, "loss": 0.5046, "rewards/accuracies": 0.5, "rewards/chosen": 0.004790563136339188, "rewards/margins": 0.0002871984615921974, "rewards/rejected": 0.00450336467474699, "step": 18 }, { "debug/policy_chosen_logits": -0.8814194202423096, "debug/policy_chosen_logps": -187.4921417236328, "debug/policy_rejected_logits": -0.9221204519271851, "debug/policy_rejected_logps": -184.78768920898438, "debug/reference_chosen_logps": -187.89466857910156, "debug/reference_rejected_logps": -186.1348876953125, "epoch": 0.4634146341463415, "grad_norm": 8.289942810390798, "learning_rate": 1e-06, "logits/chosen": -0.8814194202423096, "logits/rejected": -0.9221204519271851, "logps/chosen": -187.4921417236328, "logps/rejected": -184.78768920898438, "loss": 0.4986, "rewards/accuracies": 0.375, "rewards/chosen": 0.004025382921099663, "rewards/margins": -0.009446544572710991, "rewards/rejected": 0.013471927493810654, "step": 19 }, { "debug/policy_chosen_logits": -0.9697524905204773, "debug/policy_chosen_logps": -142.56431579589844, "debug/policy_rejected_logits": -0.7087419033050537, "debug/policy_rejected_logps": -181.8319091796875, "debug/reference_chosen_logps": -145.44419860839844, "debug/reference_rejected_logps": -184.153564453125, "epoch": 0.4878048780487805, "grad_norm": 9.078292331479483, "learning_rate": 1e-06, "logits/chosen": -0.9697524905204773, "logits/rejected": -0.7087419033050537, "logps/chosen": -142.56431579589844, "logps/rejected": -181.8319091796875, "loss": 0.4974, "rewards/accuracies": 0.625, "rewards/chosen": 0.028798799961805344, "rewards/margins": 0.005582190118730068, "rewards/rejected": 0.0232166089117527, "step": 20 }, { "debug/policy_chosen_logits": -0.8631553649902344, "debug/policy_chosen_logps": -181.8375701904297, "debug/policy_rejected_logits": -0.8101003170013428, "debug/policy_rejected_logps": -204.69305419921875, "debug/reference_chosen_logps": -181.6129150390625, "debug/reference_rejected_logps": -206.70823669433594, "epoch": 0.5121951219512195, "grad_norm": 9.63889554364432, "learning_rate": 1e-06, "logits/chosen": -0.8631553649902344, "logits/rejected": -0.8101003170013428, "logps/chosen": -181.8375701904297, "logps/rejected": -204.69305419921875, "loss": 0.5031, "rewards/accuracies": 0.125, "rewards/chosen": -0.0022465987130999565, "rewards/margins": -0.02239835634827614, "rewards/rejected": 0.020151756703853607, "step": 21 }, { "debug/policy_chosen_logits": -0.900942862033844, "debug/policy_chosen_logps": -156.7996826171875, "debug/policy_rejected_logits": -0.7672395706176758, "debug/policy_rejected_logps": -190.6102294921875, "debug/reference_chosen_logps": -157.8141326904297, "debug/reference_rejected_logps": -191.65597534179688, "epoch": 0.5365853658536586, "grad_norm": 8.86160952811184, "learning_rate": 1e-06, "logits/chosen": -0.900942862033844, "logits/rejected": -0.7672395706176758, "logps/chosen": -156.7996826171875, "logps/rejected": -190.6102294921875, "loss": 0.4989, "rewards/accuracies": 0.375, "rewards/chosen": 0.010144614614546299, "rewards/margins": -0.0003129197284579277, "rewards/rejected": 0.010457534343004227, "step": 22 }, { "debug/policy_chosen_logits": -0.8332923054695129, "debug/policy_chosen_logps": -155.85830688476562, "debug/policy_rejected_logits": -0.7771956324577332, "debug/policy_rejected_logps": -165.27749633789062, "debug/reference_chosen_logps": -156.92462158203125, "debug/reference_rejected_logps": -165.16729736328125, "epoch": 0.5609756097560976, "grad_norm": 8.143779892532098, "learning_rate": 1e-06, "logits/chosen": -0.8332923054695129, "logits/rejected": -0.7771956324577332, "logps/chosen": -155.85830688476562, "logps/rejected": -165.27749633789062, "loss": 0.5003, "rewards/accuracies": 0.625, "rewards/chosen": 0.0106631089001894, "rewards/margins": 0.011765326373279095, "rewards/rejected": -0.001102218870073557, "step": 23 }, { "debug/policy_chosen_logits": -1.01643705368042, "debug/policy_chosen_logps": -142.14419555664062, "debug/policy_rejected_logits": -0.9811778664588928, "debug/policy_rejected_logps": -161.79629516601562, "debug/reference_chosen_logps": -141.1592559814453, "debug/reference_rejected_logps": -159.3284149169922, "epoch": 0.5853658536585366, "grad_norm": 8.084865068688668, "learning_rate": 1e-06, "logits/chosen": -1.01643705368042, "logits/rejected": -0.9811778664588928, "logps/chosen": -142.14419555664062, "logps/rejected": -161.79629516601562, "loss": 0.5022, "rewards/accuracies": 0.875, "rewards/chosen": -0.009849337860941887, "rewards/margins": 0.014829404652118683, "rewards/rejected": -0.02467874437570572, "step": 24 }, { "debug/policy_chosen_logits": -0.7919837236404419, "debug/policy_chosen_logps": -162.75352478027344, "debug/policy_rejected_logits": -0.96061772108078, "debug/policy_rejected_logps": -169.0644073486328, "debug/reference_chosen_logps": -161.2084503173828, "debug/reference_rejected_logps": -167.456787109375, "epoch": 0.6097560975609756, "grad_norm": 9.111682216370516, "learning_rate": 1e-06, "logits/chosen": -0.7919837236404419, "logits/rejected": -0.96061772108078, "logps/chosen": -162.75352478027344, "logps/rejected": -169.0644073486328, "loss": 0.5016, "rewards/accuracies": 0.5, "rewards/chosen": -0.015450754202902317, "rewards/margins": 0.0006255432963371277, "rewards/rejected": -0.01607629843056202, "step": 25 }, { "debug/policy_chosen_logits": -0.9698644876480103, "debug/policy_chosen_logps": -165.7355194091797, "debug/policy_rejected_logits": -0.7252393364906311, "debug/policy_rejected_logps": -183.15887451171875, "debug/reference_chosen_logps": -164.15481567382812, "debug/reference_rejected_logps": -182.38124084472656, "epoch": 0.6341463414634146, "grad_norm": 10.115977003883836, "learning_rate": 1e-06, "logits/chosen": -0.9698644876480103, "logits/rejected": -0.7252393364906311, "logps/chosen": -165.7355194091797, "logps/rejected": -183.15887451171875, "loss": 0.502, "rewards/accuracies": 0.5, "rewards/chosen": -0.015806876122951508, "rewards/margins": -0.008030500262975693, "rewards/rejected": -0.00777637492865324, "step": 26 }, { "debug/policy_chosen_logits": -0.7707141637802124, "debug/policy_chosen_logps": -205.21841430664062, "debug/policy_rejected_logits": -0.9271693229675293, "debug/policy_rejected_logps": -168.64834594726562, "debug/reference_chosen_logps": -205.48428344726562, "debug/reference_rejected_logps": -168.75538635253906, "epoch": 0.6585365853658537, "grad_norm": 9.132022098455337, "learning_rate": 1e-06, "logits/chosen": -0.7707141637802124, "logits/rejected": -0.9271693229675293, "logps/chosen": -205.21841430664062, "logps/rejected": -168.64834594726562, "loss": 0.4964, "rewards/accuracies": 0.625, "rewards/chosen": 0.002658710815012455, "rewards/margins": 0.0015882402658462524, "rewards/rejected": 0.0010704714804887772, "step": 27 }, { "debug/policy_chosen_logits": -0.9867036938667297, "debug/policy_chosen_logps": -170.24920654296875, "debug/policy_rejected_logits": -0.958678126335144, "debug/policy_rejected_logps": -172.13462829589844, "debug/reference_chosen_logps": -169.69461059570312, "debug/reference_rejected_logps": -171.46791076660156, "epoch": 0.6829268292682927, "grad_norm": 8.165335264629883, "learning_rate": 1e-06, "logits/chosen": -0.9867036938667297, "logits/rejected": -0.958678126335144, "logps/chosen": -170.24920654296875, "logps/rejected": -172.13462829589844, "loss": 0.4934, "rewards/accuracies": 0.375, "rewards/chosen": -0.005545825697481632, "rewards/margins": 0.0011212928220629692, "rewards/rejected": -0.006667118053883314, "step": 28 }, { "debug/policy_chosen_logits": -0.6699000000953674, "debug/policy_chosen_logps": -189.43618774414062, "debug/policy_rejected_logits": -0.8863887786865234, "debug/policy_rejected_logps": -179.74417114257812, "debug/reference_chosen_logps": -188.31427001953125, "debug/reference_rejected_logps": -179.5847625732422, "epoch": 0.7073170731707317, "grad_norm": 8.679731931376267, "learning_rate": 1e-06, "logits/chosen": -0.6699000000953674, "logits/rejected": -0.8863887786865234, "logps/chosen": -189.43618774414062, "logps/rejected": -179.74417114257812, "loss": 0.5078, "rewards/accuracies": 0.375, "rewards/chosen": -0.011219177395105362, "rewards/margins": -0.009625071659684181, "rewards/rejected": -0.0015941057354211807, "step": 29 }, { "debug/policy_chosen_logits": -0.9941346645355225, "debug/policy_chosen_logps": -139.71783447265625, "debug/policy_rejected_logits": -0.9757084846496582, "debug/policy_rejected_logps": -147.81222534179688, "debug/reference_chosen_logps": -140.21261596679688, "debug/reference_rejected_logps": -150.31747436523438, "epoch": 0.7317073170731707, "grad_norm": 8.152161947975058, "learning_rate": 1e-06, "logits/chosen": -0.9941346645355225, "logits/rejected": -0.9757084846496582, "logps/chosen": -139.71783447265625, "logps/rejected": -147.81222534179688, "loss": 0.4989, "rewards/accuracies": 0.375, "rewards/chosen": 0.0049479007720947266, "rewards/margins": -0.020104561001062393, "rewards/rejected": 0.02505246177315712, "step": 30 }, { "debug/policy_chosen_logits": -0.9950529932975769, "debug/policy_chosen_logps": -167.46485900878906, "debug/policy_rejected_logits": -0.9336649179458618, "debug/policy_rejected_logps": -161.46160888671875, "debug/reference_chosen_logps": -170.01150512695312, "debug/reference_rejected_logps": -164.1259765625, "epoch": 0.7560975609756098, "grad_norm": 8.534988911622145, "learning_rate": 1e-06, "logits/chosen": -0.9950529932975769, "logits/rejected": -0.9336649179458618, "logps/chosen": -167.46485900878906, "logps/rejected": -161.46160888671875, "loss": 0.4947, "rewards/accuracies": 0.375, "rewards/chosen": 0.02546636573970318, "rewards/margins": -0.0011772820726037025, "rewards/rejected": 0.026643646880984306, "step": 31 }, { "debug/policy_chosen_logits": -0.99274080991745, "debug/policy_chosen_logps": -141.17715454101562, "debug/policy_rejected_logits": -0.9643717408180237, "debug/policy_rejected_logps": -163.40673828125, "debug/reference_chosen_logps": -142.4697723388672, "debug/reference_rejected_logps": -163.57907104492188, "epoch": 0.7804878048780488, "grad_norm": 8.868022634011657, "learning_rate": 1e-06, "logits/chosen": -0.99274080991745, "logits/rejected": -0.9643717408180237, "logps/chosen": -141.17715454101562, "logps/rejected": -163.40673828125, "loss": 0.4887, "rewards/accuracies": 0.625, "rewards/chosen": 0.012926094233989716, "rewards/margins": 0.01120278425514698, "rewards/rejected": 0.0017233085818588734, "step": 32 }, { "debug/policy_chosen_logits": -0.7678695321083069, "debug/policy_chosen_logps": -179.30178833007812, "debug/policy_rejected_logits": -0.9433090686798096, "debug/policy_rejected_logps": -166.5570068359375, "debug/reference_chosen_logps": -178.72622680664062, "debug/reference_rejected_logps": -164.87799072265625, "epoch": 0.8048780487804879, "grad_norm": 8.118291328693896, "learning_rate": 1e-06, "logits/chosen": -0.7678695321083069, "logits/rejected": -0.9433090686798096, "logps/chosen": -179.30178833007812, "logps/rejected": -166.5570068359375, "loss": 0.4949, "rewards/accuracies": 0.625, "rewards/chosen": -0.005755499936640263, "rewards/margins": 0.011034755036234856, "rewards/rejected": -0.016790255904197693, "step": 33 }, { "debug/policy_chosen_logits": -0.9250268936157227, "debug/policy_chosen_logps": -174.24758911132812, "debug/policy_rejected_logits": -0.8358609676361084, "debug/policy_rejected_logps": -207.80772399902344, "debug/reference_chosen_logps": -174.22793579101562, "debug/reference_rejected_logps": -206.686279296875, "epoch": 0.8292682926829268, "grad_norm": 9.100751206625086, "learning_rate": 1e-06, "logits/chosen": -0.9250268936157227, "logits/rejected": -0.8358609676361084, "logps/chosen": -174.24758911132812, "logps/rejected": -207.80772399902344, "loss": 0.5021, "rewards/accuracies": 0.75, "rewards/chosen": -0.00019640009850263596, "rewards/margins": 0.01101800799369812, "rewards/rejected": -0.01121440902352333, "step": 34 }, { "debug/policy_chosen_logits": -0.9135526418685913, "debug/policy_chosen_logps": -159.21298217773438, "debug/policy_rejected_logits": -1.1572299003601074, "debug/policy_rejected_logps": -147.13705444335938, "debug/reference_chosen_logps": -159.4290771484375, "debug/reference_rejected_logps": -145.97109985351562, "epoch": 0.8536585365853658, "grad_norm": 8.661910409548431, "learning_rate": 1e-06, "logits/chosen": -0.9135526418685913, "logits/rejected": -1.1572299003601074, "logps/chosen": -159.21298217773438, "logps/rejected": -147.13705444335938, "loss": 0.4925, "rewards/accuracies": 0.5, "rewards/chosen": 0.0021607973612844944, "rewards/margins": 0.013820314779877663, "rewards/rejected": -0.011659517884254456, "step": 35 }, { "debug/policy_chosen_logits": -0.706568717956543, "debug/policy_chosen_logps": -185.29225158691406, "debug/policy_rejected_logits": -0.772286593914032, "debug/policy_rejected_logps": -181.357177734375, "debug/reference_chosen_logps": -186.060302734375, "debug/reference_rejected_logps": -181.81985473632812, "epoch": 0.8780487804878049, "grad_norm": 8.794546491268, "learning_rate": 1e-06, "logits/chosen": -0.706568717956543, "logits/rejected": -0.772286593914032, "logps/chosen": -185.29225158691406, "logps/rejected": -181.357177734375, "loss": 0.4963, "rewards/accuracies": 0.625, "rewards/chosen": 0.007680453825742006, "rewards/margins": 0.0030536362901329994, "rewards/rejected": 0.004626817535609007, "step": 36 }, { "debug/policy_chosen_logits": -0.9353126883506775, "debug/policy_chosen_logps": -148.95077514648438, "debug/policy_rejected_logits": -0.8417949080467224, "debug/policy_rejected_logps": -192.06149291992188, "debug/reference_chosen_logps": -148.9204864501953, "debug/reference_rejected_logps": -192.1736297607422, "epoch": 0.9024390243902439, "grad_norm": 9.147337941113607, "learning_rate": 1e-06, "logits/chosen": -0.9353126883506775, "logits/rejected": -0.8417949080467224, "logps/chosen": -148.95077514648438, "logps/rejected": -192.06149291992188, "loss": 0.4959, "rewards/accuracies": 0.625, "rewards/chosen": -0.00030298251658678055, "rewards/margins": -0.001424331683665514, "rewards/rejected": 0.0011213491670787334, "step": 37 }, { "debug/policy_chosen_logits": -0.9579962491989136, "debug/policy_chosen_logps": -145.99325561523438, "debug/policy_rejected_logits": -0.771964430809021, "debug/policy_rejected_logps": -181.51898193359375, "debug/reference_chosen_logps": -146.1923828125, "debug/reference_rejected_logps": -181.7994384765625, "epoch": 0.926829268292683, "grad_norm": 8.770906218078656, "learning_rate": 1e-06, "logits/chosen": -0.9579962491989136, "logits/rejected": -0.771964430809021, "logps/chosen": -145.99325561523438, "logps/rejected": -181.51898193359375, "loss": 0.5028, "rewards/accuracies": 0.375, "rewards/chosen": 0.0019913101568818092, "rewards/margins": -0.0008130739443004131, "rewards/rejected": 0.0028043838683515787, "step": 38 }, { "debug/policy_chosen_logits": -0.7748340964317322, "debug/policy_chosen_logps": -192.96409606933594, "debug/policy_rejected_logits": -0.7723251581192017, "debug/policy_rejected_logps": -193.10032653808594, "debug/reference_chosen_logps": -194.00680541992188, "debug/reference_rejected_logps": -195.18478393554688, "epoch": 0.9512195121951219, "grad_norm": 9.455831322622839, "learning_rate": 1e-06, "logits/chosen": -0.7748340964317322, "logits/rejected": -0.7723251581192017, "logps/chosen": -192.96409606933594, "logps/rejected": -193.10032653808594, "loss": 0.4989, "rewards/accuracies": 0.375, "rewards/chosen": 0.010427169501781464, "rewards/margins": -0.010417431592941284, "rewards/rejected": 0.020844601094722748, "step": 39 }, { "debug/policy_chosen_logits": -0.9706252813339233, "debug/policy_chosen_logps": -156.927490234375, "debug/policy_rejected_logits": -0.8014242649078369, "debug/policy_rejected_logps": -197.02548217773438, "debug/reference_chosen_logps": -155.9664764404297, "debug/reference_rejected_logps": -197.78335571289062, "epoch": 0.975609756097561, "grad_norm": 9.78790232486535, "learning_rate": 1e-06, "logits/chosen": -0.9706252813339233, "logits/rejected": -0.8014242649078369, "logps/chosen": -156.927490234375, "logps/rejected": -197.02548217773438, "loss": 0.5018, "rewards/accuracies": 0.25, "rewards/chosen": -0.009610195644199848, "rewards/margins": -0.017188798636198044, "rewards/rejected": 0.007578602526336908, "step": 40 }, { "debug/policy_chosen_logits": -0.7953402400016785, "debug/policy_chosen_logps": -170.52337646484375, "debug/policy_rejected_logits": -0.8171042799949646, "debug/policy_rejected_logps": -175.24819946289062, "debug/reference_chosen_logps": -169.29534912109375, "debug/reference_rejected_logps": -173.3860626220703, "epoch": 1.0, "grad_norm": 10.67828699749108, "learning_rate": 1e-06, "logits/chosen": -0.7953402400016785, "logits/rejected": -0.8171042799949646, "logps/chosen": -170.52337646484375, "logps/rejected": -175.24819946289062, "loss": 0.4925, "rewards/accuracies": 0.625, "rewards/chosen": -0.01228022575378418, "rewards/margins": 0.006341142114251852, "rewards/rejected": -0.01862136647105217, "step": 41 }, { "epoch": 1.0, "step": 41, "total_flos": 0.0, "train_loss": 0.49933575202779074, "train_runtime": 145.5342, "train_samples_per_second": 17.989, "train_steps_per_second": 0.282 } ], "logging_steps": 1, "max_steps": 41, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }