{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 50, "global_step": 436, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.022935779816513763, "grad_norm": 5.604829014017037, "learning_rate": 1.1363636363636363e-07, "logits/chosen": -2.619293689727783, "logits/rejected": -2.5524563789367676, "logps/chosen": -265.4638977050781, "logps/rejected": -236.136962890625, "loss": 0.1942, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.00018078314315062016, "rewards/margins": -0.00011108322360087186, "rewards/rejected": -6.969989772187546e-05, "step": 10 }, { "epoch": 0.045871559633027525, "grad_norm": 5.250583616242791, "learning_rate": 2.2727272727272726e-07, "logits/chosen": -2.6577353477478027, "logits/rejected": -2.575864315032959, "logps/chosen": -298.8517150878906, "logps/rejected": -274.293212890625, "loss": 0.1959, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.0006643417873419821, "rewards/margins": 0.0011681303149089217, "rewards/rejected": -0.0018324721604585648, "step": 20 }, { "epoch": 0.06880733944954129, "grad_norm": 4.980077918182304, "learning_rate": 3.4090909090909085e-07, "logits/chosen": -2.675644636154175, "logits/rejected": -2.6021487712860107, "logps/chosen": -290.4170227050781, "logps/rejected": -234.3779296875, "loss": 0.1923, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.009157607331871986, "rewards/margins": 0.013118197210133076, "rewards/rejected": -0.003960589878261089, "step": 30 }, { "epoch": 0.09174311926605505, "grad_norm": 5.215480488397355, "learning_rate": 4.545454545454545e-07, "logits/chosen": -2.660599946975708, "logits/rejected": -2.6108055114746094, "logps/chosen": -281.02667236328125, "logps/rejected": -267.7845764160156, "loss": 0.1835, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.04142485558986664, "rewards/margins": 0.04134037345647812, "rewards/rejected": 8.447784784948453e-05, "step": 40 }, { "epoch": 0.11467889908256881, "grad_norm": 5.6967811731593905, "learning_rate": 4.997110275491701e-07, "logits/chosen": -2.619861602783203, "logits/rejected": -2.6124091148376465, "logps/chosen": -293.50762939453125, "logps/rejected": -303.9714660644531, "loss": 0.1732, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.027435380965471268, "rewards/margins": 0.0753081664443016, "rewards/rejected": -0.04787278175354004, "step": 50 }, { "epoch": 0.11467889908256881, "eval_logits/chosen": -2.5723471641540527, "eval_logits/rejected": -2.49267315864563, "eval_logps/chosen": -283.6944580078125, "eval_logps/rejected": -257.6913146972656, "eval_loss": 0.1639617681503296, "eval_rewards/accuracies": 0.7068965435028076, "eval_rewards/chosen": 0.013956375420093536, "eval_rewards/margins": 0.12564486265182495, "eval_rewards/rejected": -0.11168847978115082, "eval_runtime": 95.9923, "eval_samples_per_second": 18.939, "eval_steps_per_second": 0.302, "step": 50 }, { "epoch": 0.13761467889908258, "grad_norm": 7.248589740263705, "learning_rate": 4.979475034558115e-07, "logits/chosen": -2.5621228218078613, "logits/rejected": -2.503438949584961, "logps/chosen": -293.131103515625, "logps/rejected": -272.94793701171875, "loss": 0.1592, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.03646404668688774, "rewards/margins": 0.11744984239339828, "rewards/rejected": -0.15391388535499573, "step": 60 }, { "epoch": 0.16055045871559634, "grad_norm": 7.653772817525038, "learning_rate": 4.945923025551788e-07, "logits/chosen": -2.4572176933288574, "logits/rejected": -2.407254219055176, "logps/chosen": -327.6614685058594, "logps/rejected": -284.7535705566406, "loss": 0.1557, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.0009046800551004708, "rewards/margins": 0.23255476355552673, "rewards/rejected": -0.23345942795276642, "step": 70 }, { "epoch": 0.1834862385321101, "grad_norm": 8.127224192262021, "learning_rate": 4.896669632591651e-07, "logits/chosen": -2.3246560096740723, "logits/rejected": -2.2035281658172607, "logps/chosen": -297.11651611328125, "logps/rejected": -290.16302490234375, "loss": 0.1475, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.11266676336526871, "rewards/margins": 0.2624030113220215, "rewards/rejected": -0.3750697076320648, "step": 80 }, { "epoch": 0.20642201834862386, "grad_norm": 8.777001707426516, "learning_rate": 4.832031033425662e-07, "logits/chosen": -1.9738223552703857, "logits/rejected": -1.9115241765975952, "logps/chosen": -299.46820068359375, "logps/rejected": -291.00994873046875, "loss": 0.1449, "rewards/accuracies": 0.71875, "rewards/chosen": -0.11874761432409286, "rewards/margins": 0.3233392536640167, "rewards/rejected": -0.4420868456363678, "step": 90 }, { "epoch": 0.22935779816513763, "grad_norm": 9.678897311244762, "learning_rate": 4.752422169756047e-07, "logits/chosen": -1.0515625476837158, "logits/rejected": -1.0063254833221436, "logps/chosen": -311.99945068359375, "logps/rejected": -342.3149719238281, "loss": 0.1412, "rewards/accuracies": 0.6875, "rewards/chosen": -0.39574748277664185, "rewards/margins": 0.3081180453300476, "rewards/rejected": -0.7038655281066895, "step": 100 }, { "epoch": 0.22935779816513763, "eval_logits/chosen": -1.1028852462768555, "eval_logits/rejected": -0.8337642550468445, "eval_logps/chosen": -341.0970458984375, "eval_logps/rejected": -340.4026794433594, "eval_loss": 0.13921599090099335, "eval_rewards/accuracies": 0.7198275923728943, "eval_rewards/chosen": -0.5600696802139282, "eval_rewards/margins": 0.3787323534488678, "eval_rewards/rejected": -0.9388020038604736, "eval_runtime": 95.7518, "eval_samples_per_second": 18.987, "eval_steps_per_second": 0.303, "step": 100 }, { "epoch": 0.25229357798165136, "grad_norm": 8.075119669591446, "learning_rate": 4.658354083558188e-07, "logits/chosen": -1.309912085533142, "logits/rejected": -1.0928936004638672, "logps/chosen": -323.1109313964844, "logps/rejected": -320.5851135253906, "loss": 0.1371, "rewards/accuracies": 0.71875, "rewards/chosen": -0.5792280435562134, "rewards/margins": 0.2892912030220032, "rewards/rejected": -0.868519127368927, "step": 110 }, { "epoch": 0.27522935779816515, "grad_norm": 11.527192184915142, "learning_rate": 4.550430636492389e-07, "logits/chosen": -1.2554824352264404, "logits/rejected": -1.2111155986785889, "logps/chosen": -315.65582275390625, "logps/rejected": -323.9941101074219, "loss": 0.1347, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.39836472272872925, "rewards/margins": 0.3269372582435608, "rewards/rejected": -0.7253020405769348, "step": 120 }, { "epoch": 0.2981651376146789, "grad_norm": 10.253713539855786, "learning_rate": 4.429344633468004e-07, "logits/chosen": -0.9448979496955872, "logits/rejected": -0.6800575256347656, "logps/chosen": -294.2066345214844, "logps/rejected": -319.7179870605469, "loss": 0.1367, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.38026565313339233, "rewards/margins": 0.39078813791275024, "rewards/rejected": -0.7710537910461426, "step": 130 }, { "epoch": 0.3211009174311927, "grad_norm": 11.166916308293251, "learning_rate": 4.2958733752443187e-07, "logits/chosen": -0.8638193011283875, "logits/rejected": -0.5810004472732544, "logps/chosen": -307.26165771484375, "logps/rejected": -287.42547607421875, "loss": 0.134, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.3107650578022003, "rewards/margins": 0.3205246925354004, "rewards/rejected": -0.6312897801399231, "step": 140 }, { "epoch": 0.3440366972477064, "grad_norm": 10.491465778765697, "learning_rate": 4.150873668617898e-07, "logits/chosen": -0.785676121711731, "logits/rejected": -0.4156356453895569, "logps/chosen": -306.7540588378906, "logps/rejected": -312.4991149902344, "loss": 0.1327, "rewards/accuracies": 0.71875, "rewards/chosen": -0.40210476517677307, "rewards/margins": 0.39101067185401917, "rewards/rejected": -0.793115496635437, "step": 150 }, { "epoch": 0.3440366972477064, "eval_logits/chosen": -0.934603214263916, "eval_logits/rejected": -0.4213140308856964, "eval_logps/chosen": -321.6920166015625, "eval_logps/rejected": -333.3775939941406, "eval_loss": 0.1256996989250183, "eval_rewards/accuracies": 0.7241379022598267, "eval_rewards/chosen": -0.36601918935775757, "eval_rewards/margins": 0.5025321841239929, "eval_rewards/rejected": -0.8685513734817505, "eval_runtime": 95.7427, "eval_samples_per_second": 18.988, "eval_steps_per_second": 0.303, "step": 150 }, { "epoch": 0.3669724770642202, "grad_norm": 10.606572226202415, "learning_rate": 3.9952763262280397e-07, "logits/chosen": -0.6937485933303833, "logits/rejected": -0.39156150817871094, "logps/chosen": -341.2958068847656, "logps/rejected": -376.64788818359375, "loss": 0.1226, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.4474472105503082, "rewards/margins": 0.4593452513217926, "rewards/rejected": -0.9067924618721008, "step": 160 }, { "epoch": 0.38990825688073394, "grad_norm": 7.909933445936232, "learning_rate": 3.8300801912883414e-07, "logits/chosen": -0.5848881602287292, "logits/rejected": -0.295288622379303, "logps/chosen": -314.4745178222656, "logps/rejected": -355.83209228515625, "loss": 0.1194, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.42873620986938477, "rewards/margins": 0.4288958013057709, "rewards/rejected": -0.857632040977478, "step": 170 }, { "epoch": 0.41284403669724773, "grad_norm": 9.296204512324422, "learning_rate": 3.6563457256020884e-07, "logits/chosen": -0.4405382573604584, "logits/rejected": 0.17199954390525818, "logps/chosen": -351.5772399902344, "logps/rejected": -332.4178771972656, "loss": 0.1256, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.48219355940818787, "rewards/margins": 0.4570356011390686, "rewards/rejected": -0.9392291903495789, "step": 180 }, { "epoch": 0.43577981651376146, "grad_norm": 9.788859469537575, "learning_rate": 3.475188202022617e-07, "logits/chosen": -0.9616500735282898, "logits/rejected": -0.5780771970748901, "logps/chosen": -284.89788818359375, "logps/rejected": -339.87139892578125, "loss": 0.124, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.2678338885307312, "rewards/margins": 0.487435519695282, "rewards/rejected": -0.7552694082260132, "step": 190 }, { "epoch": 0.45871559633027525, "grad_norm": 8.31792118291438, "learning_rate": 3.287770545059052e-07, "logits/chosen": -0.9191502332687378, "logits/rejected": -0.3312894105911255, "logps/chosen": -313.3896179199219, "logps/rejected": -324.57000732421875, "loss": 0.1242, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.3510800302028656, "rewards/margins": 0.4560603201389313, "rewards/rejected": -0.8071403503417969, "step": 200 }, { "epoch": 0.45871559633027525, "eval_logits/chosen": -0.9084411859512329, "eval_logits/rejected": -0.1497526615858078, "eval_logps/chosen": -319.3252258300781, "eval_logps/rejected": -331.5806884765625, "eval_loss": 0.11925708502531052, "eval_rewards/accuracies": 0.7456896305084229, "eval_rewards/chosen": -0.34235113859176636, "eval_rewards/margins": 0.5082312226295471, "eval_rewards/rejected": -0.8505823612213135, "eval_runtime": 93.7494, "eval_samples_per_second": 19.392, "eval_steps_per_second": 0.309, "step": 200 }, { "epoch": 0.481651376146789, "grad_norm": 7.584339731807936, "learning_rate": 3.0952958655864954e-07, "logits/chosen": -0.6695399284362793, "logits/rejected": -0.2294241189956665, "logps/chosen": -311.20343017578125, "logps/rejected": -338.2936096191406, "loss": 0.1173, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.3789902925491333, "rewards/margins": 0.4605233073234558, "rewards/rejected": -0.8395135998725891, "step": 210 }, { "epoch": 0.5045871559633027, "grad_norm": 10.1245037946263, "learning_rate": 2.898999737583448e-07, "logits/chosen": -0.6778780221939087, "logits/rejected": 0.15124213695526123, "logps/chosen": -365.30010986328125, "logps/rejected": -385.3759460449219, "loss": 0.1146, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.42619770765304565, "rewards/margins": 0.5778595209121704, "rewards/rejected": -1.0040571689605713, "step": 220 }, { "epoch": 0.5275229357798165, "grad_norm": 8.611834221562093, "learning_rate": 2.7001422664752333e-07, "logits/chosen": -0.5495241284370422, "logits/rejected": -0.10778895765542984, "logps/chosen": -308.42864990234375, "logps/rejected": -350.29937744140625, "loss": 0.1211, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.4048517346382141, "rewards/margins": 0.4419426918029785, "rewards/rejected": -0.8467944264411926, "step": 230 }, { "epoch": 0.5504587155963303, "grad_norm": 8.84744756928296, "learning_rate": 2.5e-07, "logits/chosen": -0.8606957197189331, "logits/rejected": -0.25019243359565735, "logps/chosen": -323.67840576171875, "logps/rejected": -341.2455749511719, "loss": 0.1198, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.3614657521247864, "rewards/margins": 0.4374900758266449, "rewards/rejected": -0.7989557981491089, "step": 240 }, { "epoch": 0.573394495412844, "grad_norm": 8.762072273295908, "learning_rate": 2.2998577335247667e-07, "logits/chosen": -0.8679742813110352, "logits/rejected": -0.18818725645542145, "logps/chosen": -340.2317810058594, "logps/rejected": -349.81414794921875, "loss": 0.118, "rewards/accuracies": 0.78125, "rewards/chosen": -0.3643796145915985, "rewards/margins": 0.5293234586715698, "rewards/rejected": -0.8937031030654907, "step": 250 }, { "epoch": 0.573394495412844, "eval_logits/chosen": -0.8147245645523071, "eval_logits/rejected": -0.025059914216399193, "eval_logps/chosen": -322.4342041015625, "eval_logps/rejected": -343.3682556152344, "eval_loss": 0.11674168705940247, "eval_rewards/accuracies": 0.7543103694915771, "eval_rewards/chosen": -0.3734411895275116, "eval_rewards/margins": 0.5950165390968323, "eval_rewards/rejected": -0.9684576988220215, "eval_runtime": 95.6284, "eval_samples_per_second": 19.011, "eval_steps_per_second": 0.303, "step": 250 }, { "epoch": 0.5963302752293578, "grad_norm": 8.055173398595795, "learning_rate": 2.1010002624165524e-07, "logits/chosen": -0.5792064666748047, "logits/rejected": -0.028709029778838158, "logps/chosen": -319.5377197265625, "logps/rejected": -383.94610595703125, "loss": 0.1186, "rewards/accuracies": 0.78125, "rewards/chosen": -0.4882320463657379, "rewards/margins": 0.6072918176651001, "rewards/rejected": -1.0955239534378052, "step": 260 }, { "epoch": 0.6192660550458715, "grad_norm": 9.235774055264796, "learning_rate": 1.9047041344135043e-07, "logits/chosen": -0.37728264927864075, "logits/rejected": 0.017757948487997055, "logps/chosen": -318.92266845703125, "logps/rejected": -360.00250244140625, "loss": 0.1169, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.5286036729812622, "rewards/margins": 0.5529505014419556, "rewards/rejected": -1.0815541744232178, "step": 270 }, { "epoch": 0.6422018348623854, "grad_norm": 9.286061693337006, "learning_rate": 1.7122294549409482e-07, "logits/chosen": -0.7753764986991882, "logits/rejected": -0.038079630583524704, "logps/chosen": -314.3120422363281, "logps/rejected": -365.4776611328125, "loss": 0.1216, "rewards/accuracies": 0.78125, "rewards/chosen": -0.4131316542625427, "rewards/margins": 0.6036869883537292, "rewards/rejected": -1.016818642616272, "step": 280 }, { "epoch": 0.6651376146788991, "grad_norm": 10.391059671359317, "learning_rate": 1.524811797977383e-07, "logits/chosen": -0.6743412017822266, "logits/rejected": -0.05127597600221634, "logps/chosen": -326.2575378417969, "logps/rejected": -345.9309387207031, "loss": 0.1144, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.3852667212486267, "rewards/margins": 0.5348750352859497, "rewards/rejected": -0.9201416969299316, "step": 290 }, { "epoch": 0.6880733944954128, "grad_norm": 11.142166437672913, "learning_rate": 1.3436542743979125e-07, "logits/chosen": -0.6289922595024109, "logits/rejected": -0.2742362916469574, "logps/chosen": -350.96099853515625, "logps/rejected": -345.9538879394531, "loss": 0.1236, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.45011693239212036, "rewards/margins": 0.42082324624061584, "rewards/rejected": -0.8709400296211243, "step": 300 }, { "epoch": 0.6880733944954128, "eval_logits/chosen": -0.5284780263900757, "eval_logits/rejected": 0.2890760600566864, "eval_logps/chosen": -332.8935546875, "eval_logps/rejected": -351.01617431640625, "eval_loss": 0.11446455121040344, "eval_rewards/accuracies": 0.7629310488700867, "eval_rewards/chosen": -0.4780345857143402, "eval_rewards/margins": 0.5669029355049133, "eval_rewards/rejected": -1.0449376106262207, "eval_runtime": 94.2008, "eval_samples_per_second": 19.299, "eval_steps_per_second": 0.308, "step": 300 }, { "epoch": 0.7110091743119266, "grad_norm": 9.249175988688211, "learning_rate": 1.1699198087116588e-07, "logits/chosen": -0.5085445642471313, "logits/rejected": 0.12936155498027802, "logps/chosen": -332.48468017578125, "logps/rejected": -366.32489013671875, "loss": 0.1229, "rewards/accuracies": 0.75, "rewards/chosen": -0.5451949238777161, "rewards/margins": 0.4535568654537201, "rewards/rejected": -0.9987518191337585, "step": 310 }, { "epoch": 0.7339449541284404, "grad_norm": 9.319929398035963, "learning_rate": 1.00472367377196e-07, "logits/chosen": -0.37469834089279175, "logits/rejected": 0.34990087151527405, "logps/chosen": -323.309326171875, "logps/rejected": -343.5829772949219, "loss": 0.1184, "rewards/accuracies": 0.78125, "rewards/chosen": -0.49077337980270386, "rewards/margins": 0.6001543402671814, "rewards/rejected": -1.0909278392791748, "step": 320 }, { "epoch": 0.7568807339449541, "grad_norm": 10.87259656283017, "learning_rate": 8.49126331382102e-08, "logits/chosen": -0.20644304156303406, "logits/rejected": 0.2470076084136963, "logps/chosen": -329.26171875, "logps/rejected": -346.26666259765625, "loss": 0.1182, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.5473949909210205, "rewards/margins": 0.44427090883255005, "rewards/rejected": -0.9916658401489258, "step": 330 }, { "epoch": 0.7798165137614679, "grad_norm": 11.502524720737265, "learning_rate": 7.041266247556812e-08, "logits/chosen": -0.511077344417572, "logits/rejected": -0.07181791216135025, "logps/chosen": -338.0530700683594, "logps/rejected": -354.2392578125, "loss": 0.1129, "rewards/accuracies": 0.78125, "rewards/chosen": -0.5006048679351807, "rewards/margins": 0.4419478476047516, "rewards/rejected": -0.9425527453422546, "step": 340 }, { "epoch": 0.8027522935779816, "grad_norm": 10.417951906388708, "learning_rate": 5.706553665319955e-08, "logits/chosen": -0.4384083151817322, "logits/rejected": 0.22936472296714783, "logps/chosen": -327.1330261230469, "logps/rejected": -339.122314453125, "loss": 0.122, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.526419460773468, "rewards/margins": 0.48818930983543396, "rewards/rejected": -1.0146087408065796, "step": 350 }, { "epoch": 0.8027522935779816, "eval_logits/chosen": -0.48056575655937195, "eval_logits/rejected": 0.33971619606018066, "eval_logps/chosen": -330.8570251464844, "eval_logps/rejected": -350.28302001953125, "eval_loss": 0.11267489194869995, "eval_rewards/accuracies": 0.767241358757019, "eval_rewards/chosen": -0.45766934752464294, "eval_rewards/margins": 0.5799364447593689, "eval_rewards/rejected": -1.0376057624816895, "eval_runtime": 94.5966, "eval_samples_per_second": 19.218, "eval_steps_per_second": 0.307, "step": 350 }, { "epoch": 0.8256880733944955, "grad_norm": 8.120253834632708, "learning_rate": 4.4956936350761005e-08, "logits/chosen": -0.412276029586792, "logits/rejected": 0.03293975815176964, "logps/chosen": -295.88397216796875, "logps/rejected": -349.67828369140625, "loss": 0.1126, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.43563956022262573, "rewards/margins": 0.47577181458473206, "rewards/rejected": -0.9114114046096802, "step": 360 }, { "epoch": 0.8486238532110092, "grad_norm": 8.937083414679394, "learning_rate": 3.416459164418123e-08, "logits/chosen": -0.7006546258926392, "logits/rejected": -0.1087837815284729, "logps/chosen": -344.0120544433594, "logps/rejected": -357.1898498535156, "loss": 0.1124, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.4442760944366455, "rewards/margins": 0.5018762350082397, "rewards/rejected": -0.9461522102355957, "step": 370 }, { "epoch": 0.8715596330275229, "grad_norm": 9.250415689561605, "learning_rate": 2.475778302439524e-08, "logits/chosen": -0.6538180112838745, "logits/rejected": 0.11514680087566376, "logps/chosen": -334.7962951660156, "logps/rejected": -353.5735778808594, "loss": 0.1165, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.3841826617717743, "rewards/margins": 0.5749909281730652, "rewards/rejected": -0.9591735601425171, "step": 380 }, { "epoch": 0.8944954128440367, "grad_norm": 9.532136069213038, "learning_rate": 1.6796896657433805e-08, "logits/chosen": -0.4097678065299988, "logits/rejected": 0.13332059979438782, "logps/chosen": -294.68280029296875, "logps/rejected": -319.02825927734375, "loss": 0.122, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.4272535443305969, "rewards/margins": 0.4699665606021881, "rewards/rejected": -0.8972201347351074, "step": 390 }, { "epoch": 0.9174311926605505, "grad_norm": 9.826819983657527, "learning_rate": 1.0333036740834855e-08, "logits/chosen": -0.3362836241722107, "logits/rejected": 0.09532846510410309, "logps/chosen": -262.51324462890625, "logps/rejected": -315.52349853515625, "loss": 0.12, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.37953639030456543, "rewards/margins": 0.48354291915893555, "rewards/rejected": -0.8630792498588562, "step": 400 }, { "epoch": 0.9174311926605505, "eval_logits/chosen": -0.6159119606018066, "eval_logits/rejected": 0.19038529694080353, "eval_logps/chosen": -325.2745361328125, "eval_logps/rejected": -342.9399108886719, "eval_loss": 0.11256735771894455, "eval_rewards/accuracies": 0.75, "eval_rewards/chosen": -0.40184491872787476, "eval_rewards/margins": 0.5623298287391663, "eval_rewards/rejected": -0.9641746282577515, "eval_runtime": 93.497, "eval_samples_per_second": 19.444, "eval_steps_per_second": 0.31, "step": 400 }, { "epoch": 0.9403669724770642, "grad_norm": 9.708164546519836, "learning_rate": 5.4076974448211685e-09, "logits/chosen": -0.34120887517929077, "logits/rejected": 0.403463214635849, "logps/chosen": -314.7442321777344, "logps/rejected": -337.3968811035156, "loss": 0.1168, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.43792352080345154, "rewards/margins": 0.5555781126022339, "rewards/rejected": -0.9935015439987183, "step": 410 }, { "epoch": 0.963302752293578, "grad_norm": 10.889878191306106, "learning_rate": 2.052496544188487e-09, "logits/chosen": -0.31919345259666443, "logits/rejected": 0.3534119725227356, "logps/chosen": -306.3693542480469, "logps/rejected": -349.92462158203125, "loss": 0.1147, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.4693039059638977, "rewards/margins": 0.5711686015129089, "rewards/rejected": -1.040472388267517, "step": 420 }, { "epoch": 0.9862385321100917, "grad_norm": 9.153719920278654, "learning_rate": 2.889724508297886e-10, "logits/chosen": -0.556784451007843, "logits/rejected": 0.2818067669868469, "logps/chosen": -345.05450439453125, "logps/rejected": -335.64605712890625, "loss": 0.1155, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.41879063844680786, "rewards/margins": 0.49730420112609863, "rewards/rejected": -0.9160947799682617, "step": 430 }, { "epoch": 1.0, "step": 436, "total_flos": 0.0, "train_loss": 0.13197535372108493, "train_runtime": 11523.0815, "train_samples_per_second": 4.839, "train_steps_per_second": 0.038 } ], "logging_steps": 10, "max_steps": 436, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }