{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9952, "eval_steps": 62, "global_step": 936, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0032, "grad_norm": 36.312375162198066, "learning_rate": 5.3191489361702125e-09, "logits/generated": -1.0505484342575073, "logits/real": -0.9055266976356506, "logps/generated": -261.58599853515625, "logps/real": -286.2762145996094, "loss": 0.9514, "rewards/accuracies": 0.0, "rewards/generated": 0.0, "rewards/margins": 0.0, "rewards/real": 0.0, "step": 1 }, { "epoch": 0.032, "grad_norm": 39.42726550261653, "learning_rate": 5.3191489361702123e-08, "logits/generated": -0.8884822726249695, "logits/real": -0.8766762614250183, "logps/generated": -277.482421875, "logps/real": -271.2518005371094, "loss": 0.9576, "rewards/accuracies": 0.4583333432674408, "rewards/generated": 0.007981490343809128, "rewards/margins": -0.007828797213733196, "rewards/real": 0.00015269446885213256, "step": 10 }, { "epoch": 0.064, "grad_norm": 69.85409102610488, "learning_rate": 1.0638297872340425e-07, "logits/generated": -0.923265814781189, "logits/real": -0.9057778120040894, "logps/generated": -280.7295227050781, "logps/real": -263.83477783203125, "loss": 0.9533, "rewards/accuracies": 0.48750001192092896, "rewards/generated": 0.025133345276117325, "rewards/margins": -0.009676715359091759, "rewards/real": 0.015456628985702991, "step": 20 }, { "epoch": 0.096, "grad_norm": 46.85150894675461, "learning_rate": 1.5957446808510638e-07, "logits/generated": -0.8915747404098511, "logits/real": -0.8856374025344849, "logps/generated": -284.23211669921875, "logps/real": -262.6820373535156, "loss": 0.948, "rewards/accuracies": 0.512499988079071, "rewards/generated": 0.026525095105171204, "rewards/margins": 0.009968547150492668, "rewards/real": 0.03649364411830902, "step": 30 }, { "epoch": 0.128, "grad_norm": 36.76446307714157, "learning_rate": 2.127659574468085e-07, "logits/generated": -0.9389626383781433, "logits/real": -0.8786951303482056, "logps/generated": -285.33123779296875, "logps/real": -269.17254638671875, "loss": 0.9351, "rewards/accuracies": 0.574999988079071, "rewards/generated": 0.06792740523815155, "rewards/margins": 0.0501725971698761, "rewards/real": 0.11810000240802765, "step": 40 }, { "epoch": 0.16, "grad_norm": 38.27523563479763, "learning_rate": 2.659574468085106e-07, "logits/generated": -0.9373289346694946, "logits/real": -0.8907437324523926, "logps/generated": -283.5718688964844, "logps/real": -272.84735107421875, "loss": 0.9338, "rewards/accuracies": 0.612500011920929, "rewards/generated": 0.13439074158668518, "rewards/margins": 0.09475749731063843, "rewards/real": 0.2291482388973236, "step": 50 }, { "epoch": 0.192, "grad_norm": 36.668178530589245, "learning_rate": 3.1914893617021275e-07, "logits/generated": -0.8845487833023071, "logits/real": -0.8273765444755554, "logps/generated": -282.7154541015625, "logps/real": -280.0048828125, "loss": 0.9274, "rewards/accuracies": 0.550000011920929, "rewards/generated": 0.2121720314025879, "rewards/margins": 0.08153967559337616, "rewards/real": 0.29371172189712524, "step": 60 }, { "epoch": 0.1984, "eval_logits/generated": -0.8568925857543945, "eval_logits/real": -0.8220176100730896, "eval_logps/generated": -279.84649658203125, "eval_logps/real": -260.2974853515625, "eval_loss": 0.9071672558784485, "eval_rewards/accuracies": 0.5769230723381042, "eval_rewards/generated": 0.28226110339164734, "eval_rewards/margins": 0.0569092221558094, "eval_rewards/real": 0.33917027711868286, "eval_runtime": 32.6477, "eval_samples_per_second": 6.126, "eval_steps_per_second": 0.398, "step": 62 }, { "epoch": 0.224, "grad_norm": 43.460924607508964, "learning_rate": 3.7234042553191484e-07, "logits/generated": -0.9295558929443359, "logits/real": -0.8886798620223999, "logps/generated": -290.6202087402344, "logps/real": -276.14556884765625, "loss": 0.8986, "rewards/accuracies": 0.675000011920929, "rewards/generated": 0.11747777462005615, "rewards/margins": 0.15664233267307281, "rewards/real": 0.2741200923919678, "step": 70 }, { "epoch": 0.256, "grad_norm": 42.188704294099885, "learning_rate": 4.25531914893617e-07, "logits/generated": -0.8356429934501648, "logits/real": -0.8451374173164368, "logps/generated": -282.71136474609375, "logps/real": -272.5228271484375, "loss": 0.8942, "rewards/accuracies": 0.612500011920929, "rewards/generated": 0.0709092766046524, "rewards/margins": 0.1463131606578827, "rewards/real": 0.2172224223613739, "step": 80 }, { "epoch": 0.288, "grad_norm": 44.785999920785976, "learning_rate": 4.787234042553192e-07, "logits/generated": -0.8496848940849304, "logits/real": -0.8155921697616577, "logps/generated": -285.31524658203125, "logps/real": -269.96636962890625, "loss": 0.8644, "rewards/accuracies": 0.699999988079071, "rewards/generated": 0.14171293377876282, "rewards/margins": 0.23642773926258087, "rewards/real": 0.3781406879425049, "step": 90 }, { "epoch": 0.32, "grad_norm": 32.986622484335804, "learning_rate": 4.96437054631829e-07, "logits/generated": -0.835403323173523, "logits/real": -0.8723608255386353, "logps/generated": -270.5906677246094, "logps/real": -262.37432861328125, "loss": 0.8325, "rewards/accuracies": 0.800000011920929, "rewards/generated": 0.09086887538433075, "rewards/margins": 0.4376682639122009, "rewards/real": 0.5285371541976929, "step": 100 }, { "epoch": 0.352, "grad_norm": 36.64209136154719, "learning_rate": 4.904988123515439e-07, "logits/generated": -0.8128947019577026, "logits/real": -0.8069890737533569, "logps/generated": -281.6543273925781, "logps/real": -274.40447998046875, "loss": 0.8402, "rewards/accuracies": 0.75, "rewards/generated": 0.45714157819747925, "rewards/margins": 0.3994951546192169, "rewards/real": 0.8566367030143738, "step": 110 }, { "epoch": 0.384, "grad_norm": 47.95190828212081, "learning_rate": 4.845605700712589e-07, "logits/generated": -0.8099591135978699, "logits/real": -0.7854174375534058, "logps/generated": -285.89910888671875, "logps/real": -270.67303466796875, "loss": 0.7991, "rewards/accuracies": 0.7749999761581421, "rewards/generated": 0.2720848619937897, "rewards/margins": 0.4240357279777527, "rewards/real": 0.69612056016922, "step": 120 }, { "epoch": 0.3968, "eval_logits/generated": -0.7344560623168945, "eval_logits/real": -0.6920404434204102, "eval_logps/generated": -277.5608215332031, "eval_logps/real": -252.03753662109375, "eval_loss": 0.7609233260154724, "eval_rewards/accuracies": 0.75, "eval_rewards/generated": 0.5108292698860168, "eval_rewards/margins": 0.6543369889259338, "eval_rewards/real": 1.1651662588119507, "eval_runtime": 31.3881, "eval_samples_per_second": 6.372, "eval_steps_per_second": 0.414, "step": 124 }, { "epoch": 0.416, "grad_norm": 42.63444765535438, "learning_rate": 4.786223277909738e-07, "logits/generated": -0.7545986771583557, "logits/real": -0.6923834085464478, "logps/generated": -279.8142395019531, "logps/real": -259.6496887207031, "loss": 0.7655, "rewards/accuracies": 0.699999988079071, "rewards/generated": 0.3708071708679199, "rewards/margins": 0.6119031310081482, "rewards/real": 0.9827102422714233, "step": 130 }, { "epoch": 0.448, "grad_norm": 51.894650836677464, "learning_rate": 4.7268408551068883e-07, "logits/generated": -0.7601853609085083, "logits/real": -0.7050287127494812, "logps/generated": -277.9598693847656, "logps/real": -256.5985107421875, "loss": 0.7557, "rewards/accuracies": 0.75, "rewards/generated": 0.4785175323486328, "rewards/margins": 0.5866004228591919, "rewards/real": 1.0651179552078247, "step": 140 }, { "epoch": 0.48, "grad_norm": 37.772186442312375, "learning_rate": 4.667458432304038e-07, "logits/generated": -0.7380754351615906, "logits/real": -0.7076193690299988, "logps/generated": -276.724853515625, "logps/real": -259.9181213378906, "loss": 0.7664, "rewards/accuracies": 0.824999988079071, "rewards/generated": 0.5476845502853394, "rewards/margins": 0.6281734704971313, "rewards/real": 1.1758581399917603, "step": 150 }, { "epoch": 0.512, "grad_norm": 83.20495744326583, "learning_rate": 4.6080760095011875e-07, "logits/generated": -0.7478477954864502, "logits/real": -0.668219268321991, "logps/generated": -277.27008056640625, "logps/real": -251.16162109375, "loss": 0.7771, "rewards/accuracies": 0.8125, "rewards/generated": 0.6926546096801758, "rewards/margins": 0.8125525712966919, "rewards/real": 1.5052071809768677, "step": 160 }, { "epoch": 0.544, "grad_norm": 83.52524463925606, "learning_rate": 4.548693586698337e-07, "logits/generated": -0.7033058404922485, "logits/real": -0.6332999467849731, "logps/generated": -273.8232421875, "logps/real": -253.6103973388672, "loss": 0.7136, "rewards/accuracies": 0.887499988079071, "rewards/generated": 0.8486793637275696, "rewards/margins": 1.0631884336471558, "rewards/real": 1.9118677377700806, "step": 170 }, { "epoch": 0.576, "grad_norm": 23.946531693681965, "learning_rate": 4.4893111638954866e-07, "logits/generated": -0.7574105262756348, "logits/real": -0.7167672514915466, "logps/generated": -279.35101318359375, "logps/real": -262.0407409667969, "loss": 0.7105, "rewards/accuracies": 0.7749999761581421, "rewards/generated": 0.8605362176895142, "rewards/margins": 0.7797509431838989, "rewards/real": 1.6402870416641235, "step": 180 }, { "epoch": 0.5952, "eval_logits/generated": -0.7211576700210571, "eval_logits/real": -0.675662636756897, "eval_logps/generated": -271.1729736328125, "eval_logps/real": -242.65411376953125, "eval_loss": 0.6947689652442932, "eval_rewards/accuracies": 0.75, "eval_rewards/generated": 1.1496131420135498, "eval_rewards/margins": 0.953899085521698, "eval_rewards/real": 2.1035122871398926, "eval_runtime": 38.7364, "eval_samples_per_second": 5.163, "eval_steps_per_second": 0.336, "step": 186 }, { "epoch": 0.608, "grad_norm": 37.231951592870594, "learning_rate": 4.429928741092636e-07, "logits/generated": -0.6917730569839478, "logits/real": -0.6692991852760315, "logps/generated": -262.572998046875, "logps/real": -254.6962890625, "loss": 0.7013, "rewards/accuracies": 0.7749999761581421, "rewards/generated": 0.902447521686554, "rewards/margins": 0.9576849937438965, "rewards/real": 1.8601325750350952, "step": 190 }, { "epoch": 0.64, "grad_norm": 38.231266643004595, "learning_rate": 4.3705463182897863e-07, "logits/generated": -0.7412205934524536, "logits/real": -0.6870383024215698, "logps/generated": -268.48175048828125, "logps/real": -251.5652313232422, "loss": 0.7171, "rewards/accuracies": 0.7250000238418579, "rewards/generated": 1.273427963256836, "rewards/margins": 0.7382354736328125, "rewards/real": 2.0116631984710693, "step": 200 }, { "epoch": 0.672, "grad_norm": 34.66760610528596, "learning_rate": 4.311163895486936e-07, "logits/generated": -0.7578158378601074, "logits/real": -0.6925237774848938, "logps/generated": -272.41131591796875, "logps/real": -260.09002685546875, "loss": 0.7391, "rewards/accuracies": 0.8125, "rewards/generated": 0.8478690981864929, "rewards/margins": 1.0086314678192139, "rewards/real": 1.8565006256103516, "step": 210 }, { "epoch": 0.704, "grad_norm": 40.79789916082821, "learning_rate": 4.251781472684085e-07, "logits/generated": -0.8165351748466492, "logits/real": -0.7475171089172363, "logps/generated": -270.42596435546875, "logps/real": -249.1069793701172, "loss": 0.7066, "rewards/accuracies": 0.762499988079071, "rewards/generated": 1.1451914310455322, "rewards/margins": 0.7671722769737244, "rewards/real": 1.9123637676239014, "step": 220 }, { "epoch": 0.736, "grad_norm": 33.14310267786424, "learning_rate": 4.192399049881235e-07, "logits/generated": -0.8264646530151367, "logits/real": -0.8473979234695435, "logps/generated": -267.9267578125, "logps/real": -250.99227905273438, "loss": 0.7015, "rewards/accuracies": 0.7749999761581421, "rewards/generated": 0.8048334121704102, "rewards/margins": 0.9929380416870117, "rewards/real": 1.797771692276001, "step": 230 }, { "epoch": 0.768, "grad_norm": 70.66964958047514, "learning_rate": 4.1330166270783846e-07, "logits/generated": -0.8181743621826172, "logits/real": -0.8160572052001953, "logps/generated": -279.4770202636719, "logps/real": -255.7892608642578, "loss": 0.6956, "rewards/accuracies": 0.737500011920929, "rewards/generated": 1.2291936874389648, "rewards/margins": 0.8495155572891235, "rewards/real": 2.078709125518799, "step": 240 }, { "epoch": 0.7936, "eval_logits/generated": -0.7590754628181458, "eval_logits/real": -0.7111349701881409, "eval_logps/generated": -268.5383605957031, "eval_logps/real": -238.238037109375, "eval_loss": 0.6512947082519531, "eval_rewards/accuracies": 0.7692307829856873, "eval_rewards/generated": 1.4130712747573853, "eval_rewards/margins": 1.1320432424545288, "eval_rewards/real": 2.545114755630493, "eval_runtime": 24.5984, "eval_samples_per_second": 8.131, "eval_steps_per_second": 0.528, "step": 248 }, { "epoch": 0.8, "grad_norm": 25.219085946596035, "learning_rate": 4.0736342042755347e-07, "logits/generated": -0.8213751912117004, "logits/real": -0.7533193826675415, "logps/generated": -278.72216796875, "logps/real": -251.270751953125, "loss": 0.6719, "rewards/accuracies": 0.824999988079071, "rewards/generated": 1.1001530885696411, "rewards/margins": 1.4536633491516113, "rewards/real": 2.553816318511963, "step": 250 }, { "epoch": 0.832, "grad_norm": 32.893074664777714, "learning_rate": 4.0142517814726837e-07, "logits/generated": -0.8878507614135742, "logits/real": -0.8324764370918274, "logps/generated": -268.8551330566406, "logps/real": -260.70123291015625, "loss": 0.6836, "rewards/accuracies": 0.7875000238418579, "rewards/generated": 1.157348394393921, "rewards/margins": 0.9511353373527527, "rewards/real": 2.1084837913513184, "step": 260 }, { "epoch": 0.864, "grad_norm": 32.254371217681815, "learning_rate": 3.9548693586698333e-07, "logits/generated": -0.8339499235153198, "logits/real": -0.7133331894874573, "logps/generated": -275.3948974609375, "logps/real": -251.73385620117188, "loss": 0.7136, "rewards/accuracies": 0.800000011920929, "rewards/generated": 1.3555936813354492, "rewards/margins": 1.1759321689605713, "rewards/real": 2.5315260887145996, "step": 270 }, { "epoch": 0.896, "grad_norm": 55.5881070490368, "learning_rate": 3.8954869358669834e-07, "logits/generated": -0.875012218952179, "logits/real": -0.8011924028396606, "logps/generated": -276.5934143066406, "logps/real": -250.3197784423828, "loss": 0.7059, "rewards/accuracies": 0.8125, "rewards/generated": 1.1495393514633179, "rewards/margins": 1.024700403213501, "rewards/real": 2.1742396354675293, "step": 280 }, { "epoch": 0.928, "grad_norm": 72.82954300539197, "learning_rate": 3.836104513064133e-07, "logits/generated": -0.8673734664916992, "logits/real": -0.7817381620407104, "logps/generated": -273.6841735839844, "logps/real": -238.8033447265625, "loss": 0.6955, "rewards/accuracies": 0.800000011920929, "rewards/generated": 1.3869714736938477, "rewards/margins": 1.2342636585235596, "rewards/real": 2.621234893798828, "step": 290 }, { "epoch": 0.96, "grad_norm": 41.89693248697711, "learning_rate": 3.7767220902612825e-07, "logits/generated": -0.8732556104660034, "logits/real": -0.7744352221488953, "logps/generated": -250.3924560546875, "logps/real": -237.4385528564453, "loss": 0.6461, "rewards/accuracies": 0.8125, "rewards/generated": 1.5663549900054932, "rewards/margins": 1.110990285873413, "rewards/real": 2.6773452758789062, "step": 300 }, { "epoch": 0.992, "grad_norm": 55.873556249472585, "learning_rate": 3.717339667458432e-07, "logits/generated": -0.8738036155700684, "logits/real": -0.844822883605957, "logps/generated": -267.45465087890625, "logps/real": -245.611328125, "loss": 0.6502, "rewards/accuracies": 0.862500011920929, "rewards/generated": 1.431753396987915, "rewards/margins": 1.5182833671569824, "rewards/real": 2.9500367641448975, "step": 310 }, { "epoch": 0.992, "eval_logits/generated": -0.8166303634643555, "eval_logits/real": -0.7764740586280823, "eval_logps/generated": -265.8044738769531, "eval_logps/real": -233.75184631347656, "eval_loss": 0.6209976077079773, "eval_rewards/accuracies": 0.8269230723381042, "eval_rewards/generated": 1.6864629983901978, "eval_rewards/margins": 1.307273268699646, "eval_rewards/real": 2.993736505508423, "eval_runtime": 31.0162, "eval_samples_per_second": 6.448, "eval_steps_per_second": 0.419, "step": 310 }, { "epoch": 1.024, "grad_norm": 22.928739523925433, "learning_rate": 3.6579572446555817e-07, "logits/generated": -0.9426137804985046, "logits/real": -0.8645849227905273, "logps/generated": -277.8576965332031, "logps/real": -246.18771362304688, "loss": 0.6107, "rewards/accuracies": 0.887499988079071, "rewards/generated": 1.5208723545074463, "rewards/margins": 1.2970173358917236, "rewards/real": 2.81788969039917, "step": 320 }, { "epoch": 1.056, "grad_norm": 20.529010492791425, "learning_rate": 3.598574821852731e-07, "logits/generated": -0.9135828018188477, "logits/real": -0.8455237150192261, "logps/generated": -273.8128662109375, "logps/real": -237.1934051513672, "loss": 0.5437, "rewards/accuracies": 0.949999988079071, "rewards/generated": 1.4894237518310547, "rewards/margins": 1.611843466758728, "rewards/real": 3.1012673377990723, "step": 330 }, { "epoch": 1.088, "grad_norm": 17.025528439812238, "learning_rate": 3.5391923990498813e-07, "logits/generated": -0.8531489372253418, "logits/real": -0.8339968919754028, "logps/generated": -267.8252868652344, "logps/real": -245.0263214111328, "loss": 0.5376, "rewards/accuracies": 0.925000011920929, "rewards/generated": 1.1969783306121826, "rewards/margins": 1.7879966497421265, "rewards/real": 2.9849750995635986, "step": 340 }, { "epoch": 1.12, "grad_norm": 17.981085687546955, "learning_rate": 3.479809976247031e-07, "logits/generated": -0.9094909429550171, "logits/real": -0.8518913388252258, "logps/generated": -274.1060791015625, "logps/real": -244.02694702148438, "loss": 0.5489, "rewards/accuracies": 0.8500000238418579, "rewards/generated": 1.431758165359497, "rewards/margins": 1.3506686687469482, "rewards/real": 2.7824270725250244, "step": 350 }, { "epoch": 1.152, "grad_norm": 26.55100517358171, "learning_rate": 3.42042755344418e-07, "logits/generated": -0.8977092504501343, "logits/real": -0.8440514802932739, "logps/generated": -265.42120361328125, "logps/real": -247.129150390625, "loss": 0.5427, "rewards/accuracies": 0.887499988079071, "rewards/generated": 1.9195735454559326, "rewards/margins": 1.6839573383331299, "rewards/real": 3.6035308837890625, "step": 360 }, { "epoch": 1.184, "grad_norm": 23.41100454470328, "learning_rate": 3.36104513064133e-07, "logits/generated": -0.8342397809028625, "logits/real": -0.783348798751831, "logps/generated": -265.4745178222656, "logps/real": -236.82321166992188, "loss": 0.5016, "rewards/accuracies": 0.8999999761581421, "rewards/generated": 1.7445224523544312, "rewards/margins": 1.8435771465301514, "rewards/real": 3.588099718093872, "step": 370 }, { "epoch": 1.1904, "eval_logits/generated": -0.8191629648208618, "eval_logits/real": -0.7952790856361389, "eval_logps/generated": -261.9469299316406, "eval_logps/real": -227.73184204101562, "eval_loss": 0.5914410948753357, "eval_rewards/accuracies": 0.8269230723381042, "eval_rewards/generated": 2.072218418121338, "eval_rewards/margins": 1.5235180854797363, "eval_rewards/real": 3.595736265182495, "eval_runtime": 24.7514, "eval_samples_per_second": 8.08, "eval_steps_per_second": 0.525, "step": 372 }, { "epoch": 1.216, "grad_norm": 16.27728226394858, "learning_rate": 3.3016627078384796e-07, "logits/generated": -0.8343329429626465, "logits/real": -0.7670712471008301, "logps/generated": -251.3539276123047, "logps/real": -231.7130889892578, "loss": 0.5315, "rewards/accuracies": 0.8500000238418579, "rewards/generated": 1.9938310384750366, "rewards/margins": 1.6337674856185913, "rewards/real": 3.627598285675049, "step": 380 }, { "epoch": 1.248, "grad_norm": 29.907639867801137, "learning_rate": 3.2422802850356297e-07, "logits/generated": -0.843111515045166, "logits/real": -0.7534579634666443, "logps/generated": -269.682373046875, "logps/real": -230.6376190185547, "loss": 0.5022, "rewards/accuracies": 0.9375, "rewards/generated": 1.7457554340362549, "rewards/margins": 2.0438263416290283, "rewards/real": 3.789581298828125, "step": 390 }, { "epoch": 1.28, "grad_norm": 22.973012647791215, "learning_rate": 3.182897862232779e-07, "logits/generated": -0.8282259702682495, "logits/real": -0.7829388380050659, "logps/generated": -267.76141357421875, "logps/real": -240.7711639404297, "loss": 0.5055, "rewards/accuracies": 0.9125000238418579, "rewards/generated": 1.7659053802490234, "rewards/margins": 1.8759300708770752, "rewards/real": 3.6418349742889404, "step": 400 }, { "epoch": 1.312, "grad_norm": 26.685885473347977, "learning_rate": 3.1235154394299283e-07, "logits/generated": -0.9135479927062988, "logits/real": -0.8400457501411438, "logps/generated": -266.3760070800781, "logps/real": -248.6231231689453, "loss": 0.5199, "rewards/accuracies": 0.887499988079071, "rewards/generated": 1.9272260665893555, "rewards/margins": 1.7287628650665283, "rewards/real": 3.655989170074463, "step": 410 }, { "epoch": 1.3439999999999999, "grad_norm": 23.295071750271106, "learning_rate": 3.0641330166270784e-07, "logits/generated": -0.8751770257949829, "logits/real": -0.8301418423652649, "logps/generated": -267.1336975097656, "logps/real": -227.45382690429688, "loss": 0.5358, "rewards/accuracies": 0.9624999761581421, "rewards/generated": 1.6288114786148071, "rewards/margins": 2.2259325981140137, "rewards/real": 3.8547439575195312, "step": 420 }, { "epoch": 1.376, "grad_norm": 32.05157391482628, "learning_rate": 3.004750593824228e-07, "logits/generated": -0.8668873906135559, "logits/real": -0.8098496198654175, "logps/generated": -264.3621520996094, "logps/real": -226.6013641357422, "loss": 0.5296, "rewards/accuracies": 0.925000011920929, "rewards/generated": 2.0178799629211426, "rewards/margins": 2.1578879356384277, "rewards/real": 4.1757683753967285, "step": 430 }, { "epoch": 1.3888, "eval_logits/generated": -0.8476991653442383, "eval_logits/real": -0.8234462141990662, "eval_logps/generated": -257.1005554199219, "eval_logps/real": -221.76821899414062, "eval_loss": 0.5808614492416382, "eval_rewards/accuracies": 0.8461538553237915, "eval_rewards/generated": 2.556854724884033, "eval_rewards/margins": 1.6352410316467285, "eval_rewards/real": 4.192095756530762, "eval_runtime": 32.3032, "eval_samples_per_second": 6.191, "eval_steps_per_second": 0.402, "step": 434 }, { "epoch": 1.408, "grad_norm": 18.40142232186995, "learning_rate": 2.9453681710213776e-07, "logits/generated": -0.8697894811630249, "logits/real": -0.8657002449035645, "logps/generated": -267.8741455078125, "logps/real": -235.3574676513672, "loss": 0.5115, "rewards/accuracies": 0.862500011920929, "rewards/generated": 2.1429734230041504, "rewards/margins": 1.742006540298462, "rewards/real": 3.8849799633026123, "step": 440 }, { "epoch": 1.44, "grad_norm": 19.82491805909192, "learning_rate": 2.885985748218527e-07, "logits/generated": -0.9570973515510559, "logits/real": -0.882495105266571, "logps/generated": -251.5717315673828, "logps/real": -233.5342254638672, "loss": 0.4804, "rewards/accuracies": 0.8999999761581421, "rewards/generated": 2.2679808139801025, "rewards/margins": 2.0746665000915527, "rewards/real": 4.342647552490234, "step": 450 }, { "epoch": 1.472, "grad_norm": 26.87479362806195, "learning_rate": 2.8266033254156767e-07, "logits/generated": -1.0152076482772827, "logits/real": -0.9541398286819458, "logps/generated": -257.68304443359375, "logps/real": -243.43212890625, "loss": 0.5026, "rewards/accuracies": 0.862500011920929, "rewards/generated": 2.0849735736846924, "rewards/margins": 1.7475240230560303, "rewards/real": 3.8324978351593018, "step": 460 }, { "epoch": 1.504, "grad_norm": 21.259236036439386, "learning_rate": 2.7672209026128263e-07, "logits/generated": -0.9456484913825989, "logits/real": -0.8882578015327454, "logps/generated": -264.85626220703125, "logps/real": -232.17562866210938, "loss": 0.4713, "rewards/accuracies": 0.9125000238418579, "rewards/generated": 2.243791341781616, "rewards/margins": 1.9264971017837524, "rewards/real": 4.1702880859375, "step": 470 }, { "epoch": 1.536, "grad_norm": 20.831486547707065, "learning_rate": 2.7078384798099764e-07, "logits/generated": -0.8949400782585144, "logits/real": -0.8671590685844421, "logps/generated": -265.330810546875, "logps/real": -234.21072387695312, "loss": 0.4643, "rewards/accuracies": 0.925000011920929, "rewards/generated": 2.0267763137817383, "rewards/margins": 2.2522521018981934, "rewards/real": 4.279028415679932, "step": 480 }, { "epoch": 1.568, "grad_norm": 18.959858317370355, "learning_rate": 2.648456057007126e-07, "logits/generated": -0.8721327781677246, "logits/real": -0.8695019483566284, "logps/generated": -269.15728759765625, "logps/real": -227.70474243164062, "loss": 0.4344, "rewards/accuracies": 0.9125000238418579, "rewards/generated": 1.667468786239624, "rewards/margins": 2.39231538772583, "rewards/real": 4.059783935546875, "step": 490 }, { "epoch": 1.5872000000000002, "eval_logits/generated": -0.8473695516586304, "eval_logits/real": -0.8298014402389526, "eval_logps/generated": -255.7716522216797, "eval_logps/real": -218.99940490722656, "eval_loss": 0.5769486427307129, "eval_rewards/accuracies": 0.8461538553237915, "eval_rewards/generated": 2.6897456645965576, "eval_rewards/margins": 1.7792328596115112, "eval_rewards/real": 4.468978404998779, "eval_runtime": 40.3112, "eval_samples_per_second": 4.961, "eval_steps_per_second": 0.322, "step": 496 }, { "epoch": 1.6, "grad_norm": 38.176675398361986, "learning_rate": 2.589073634204275e-07, "logits/generated": -0.925214946269989, "logits/real": -0.8996881246566772, "logps/generated": -267.95648193359375, "logps/real": -229.0358428955078, "loss": 0.4787, "rewards/accuracies": 0.949999988079071, "rewards/generated": 1.880769968032837, "rewards/margins": 2.249277353286743, "rewards/real": 4.13004732131958, "step": 500 }, { "epoch": 1.6320000000000001, "grad_norm": 32.83016622890067, "learning_rate": 2.529691211401425e-07, "logits/generated": -0.9375748634338379, "logits/real": -0.9112977981567383, "logps/generated": -251.23751831054688, "logps/real": -233.52267456054688, "loss": 0.4992, "rewards/accuracies": 0.925000011920929, "rewards/generated": 2.485579013824463, "rewards/margins": 2.006443500518799, "rewards/real": 4.4920220375061035, "step": 510 }, { "epoch": 1.6640000000000001, "grad_norm": 64.7570260964563, "learning_rate": 2.4703087885985747e-07, "logits/generated": -0.9547020792961121, "logits/real": -0.9207860827445984, "logps/generated": -268.57049560546875, "logps/real": -231.99697875976562, "loss": 0.4935, "rewards/accuracies": 0.9125000238418579, "rewards/generated": 1.8926427364349365, "rewards/margins": 2.454348564147949, "rewards/real": 4.346991539001465, "step": 520 }, { "epoch": 1.696, "grad_norm": 24.570258988531133, "learning_rate": 2.410926365795724e-07, "logits/generated": -0.9156527519226074, "logits/real": -0.8967121839523315, "logps/generated": -261.8643493652344, "logps/real": -231.90609741210938, "loss": 0.4815, "rewards/accuracies": 0.8999999761581421, "rewards/generated": 2.1577658653259277, "rewards/margins": 2.177294969558716, "rewards/real": 4.335061073303223, "step": 530 }, { "epoch": 1.728, "grad_norm": 15.482901282995309, "learning_rate": 2.351543942992874e-07, "logits/generated": -0.8593676686286926, "logits/real": -0.8424193263053894, "logps/generated": -258.342529296875, "logps/real": -224.79177856445312, "loss": 0.4573, "rewards/accuracies": 0.925000011920929, "rewards/generated": 2.0410842895507812, "rewards/margins": 2.2667489051818848, "rewards/real": 4.307833671569824, "step": 540 }, { "epoch": 1.76, "grad_norm": 14.61497373758776, "learning_rate": 2.2921615201900234e-07, "logits/generated": -0.9146724939346313, "logits/real": -0.9038121104240417, "logps/generated": -257.42144775390625, "logps/real": -228.86062622070312, "loss": 0.513, "rewards/accuracies": 0.8999999761581421, "rewards/generated": 2.380136013031006, "rewards/margins": 2.0490469932556152, "rewards/real": 4.429183483123779, "step": 550 }, { "epoch": 1.7856, "eval_logits/generated": -0.8718780875205994, "eval_logits/real": -0.8539044260978699, "eval_logps/generated": -253.72958374023438, "eval_logps/real": -217.2036895751953, "eval_loss": 0.5655896663665771, "eval_rewards/accuracies": 0.8461538553237915, "eval_rewards/generated": 2.8939502239227295, "eval_rewards/margins": 1.7546011209487915, "eval_rewards/real": 4.648550987243652, "eval_runtime": 40.7523, "eval_samples_per_second": 4.908, "eval_steps_per_second": 0.319, "step": 558 }, { "epoch": 1.792, "grad_norm": 29.65165132547599, "learning_rate": 2.2327790973871732e-07, "logits/generated": -0.9448938369750977, "logits/real": -0.9100682139396667, "logps/generated": -257.49267578125, "logps/real": -223.6540985107422, "loss": 0.4773, "rewards/accuracies": 0.9375, "rewards/generated": 2.7721714973449707, "rewards/margins": 2.1354167461395264, "rewards/real": 4.907588958740234, "step": 560 }, { "epoch": 1.8239999999999998, "grad_norm": 20.607939044053108, "learning_rate": 2.173396674584323e-07, "logits/generated": -0.9359772801399231, "logits/real": -0.9404600262641907, "logps/generated": -261.67462158203125, "logps/real": -218.80960083007812, "loss": 0.4987, "rewards/accuracies": 0.887499988079071, "rewards/generated": 2.250516891479492, "rewards/margins": 1.9804651737213135, "rewards/real": 4.230982303619385, "step": 570 }, { "epoch": 1.8559999999999999, "grad_norm": 26.04078915528857, "learning_rate": 2.1140142517814726e-07, "logits/generated": -0.9250372648239136, "logits/real": -0.8979923129081726, "logps/generated": -264.47955322265625, "logps/real": -234.3893585205078, "loss": 0.5084, "rewards/accuracies": 0.8500000238418579, "rewards/generated": 2.458308458328247, "rewards/margins": 2.1246516704559326, "rewards/real": 4.5829596519470215, "step": 580 }, { "epoch": 1.888, "grad_norm": 23.107054615514254, "learning_rate": 2.0546318289786222e-07, "logits/generated": -0.9009197354316711, "logits/real": -0.8869778513908386, "logps/generated": -258.01361083984375, "logps/real": -232.7992706298828, "loss": 0.4706, "rewards/accuracies": 0.862500011920929, "rewards/generated": 2.3066296577453613, "rewards/margins": 2.044299364089966, "rewards/real": 4.350929260253906, "step": 590 }, { "epoch": 1.92, "grad_norm": 26.77770924366384, "learning_rate": 1.9952494061757718e-07, "logits/generated": -0.8508394360542297, "logits/real": -0.8669518232345581, "logps/generated": -265.9466552734375, "logps/real": -228.459716796875, "loss": 0.5061, "rewards/accuracies": 0.925000011920929, "rewards/generated": 2.2428016662597656, "rewards/margins": 2.0594842433929443, "rewards/real": 4.302285671234131, "step": 600 }, { "epoch": 1.952, "grad_norm": 25.692114586650828, "learning_rate": 1.9358669833729216e-07, "logits/generated": -0.8534078598022461, "logits/real": -0.8465960621833801, "logps/generated": -263.1025695800781, "logps/real": -218.86709594726562, "loss": 0.4671, "rewards/accuracies": 0.862500011920929, "rewards/generated": 2.586224317550659, "rewards/margins": 2.0035674571990967, "rewards/real": 4.589792251586914, "step": 610 }, { "epoch": 1.984, "grad_norm": 21.446598233605766, "learning_rate": 1.876484560570071e-07, "logits/generated": -0.8966739773750305, "logits/real": -0.8015346527099609, "logps/generated": -266.8609924316406, "logps/real": -227.60147094726562, "loss": 0.4632, "rewards/accuracies": 0.9375, "rewards/generated": 2.155893564224243, "rewards/margins": 2.3377389907836914, "rewards/real": 4.4936323165893555, "step": 620 }, { "epoch": 1.984, "eval_logits/generated": -0.8338809609413147, "eval_logits/real": -0.8251470327377319, "eval_logps/generated": -253.3907928466797, "eval_logps/real": -216.5598907470703, "eval_loss": 0.5639179348945618, "eval_rewards/accuracies": 0.8461538553237915, "eval_rewards/generated": 2.9278318881988525, "eval_rewards/margins": 1.7850979566574097, "eval_rewards/real": 4.712929725646973, "eval_runtime": 38.9653, "eval_samples_per_second": 5.133, "eval_steps_per_second": 0.334, "step": 620 }, { "epoch": 2.016, "grad_norm": 13.278936423720094, "learning_rate": 1.8171021377672207e-07, "logits/generated": -0.8643847703933716, "logits/real": -0.8604650497436523, "logps/generated": -250.55349731445312, "logps/real": -220.19482421875, "loss": 0.4295, "rewards/accuracies": 0.9624999761581421, "rewards/generated": 2.490985631942749, "rewards/margins": 2.265563726425171, "rewards/real": 4.75654935836792, "step": 630 }, { "epoch": 2.048, "grad_norm": 14.220248158603162, "learning_rate": 1.7577197149643706e-07, "logits/generated": -0.8905068635940552, "logits/real": -0.9017655253410339, "logps/generated": -256.44488525390625, "logps/real": -231.462890625, "loss": 0.3853, "rewards/accuracies": 0.987500011920929, "rewards/generated": 2.124544858932495, "rewards/margins": 2.6344566345214844, "rewards/real": 4.759001731872559, "step": 640 }, { "epoch": 2.08, "grad_norm": 26.680004876796023, "learning_rate": 1.6983372921615202e-07, "logits/generated": -0.9198824167251587, "logits/real": -0.8907524347305298, "logps/generated": -267.86505126953125, "logps/real": -229.6920623779297, "loss": 0.3975, "rewards/accuracies": 0.9750000238418579, "rewards/generated": 2.2710652351379395, "rewards/margins": 2.368361711502075, "rewards/real": 4.639427661895752, "step": 650 }, { "epoch": 2.112, "grad_norm": 21.73718070190174, "learning_rate": 1.6389548693586697e-07, "logits/generated": -0.965892493724823, "logits/real": -0.9026229977607727, "logps/generated": -255.9954376220703, "logps/real": -225.38803100585938, "loss": 0.3846, "rewards/accuracies": 0.9624999761581421, "rewards/generated": 2.1424877643585205, "rewards/margins": 2.7220892906188965, "rewards/real": 4.864577293395996, "step": 660 }, { "epoch": 2.144, "grad_norm": 17.5024622116479, "learning_rate": 1.5795724465558193e-07, "logits/generated": -0.9057500958442688, "logits/real": -0.8590014576911926, "logps/generated": -245.63809204101562, "logps/real": -222.30953979492188, "loss": 0.4015, "rewards/accuracies": 0.9624999761581421, "rewards/generated": 2.4586942195892334, "rewards/margins": 2.442350149154663, "rewards/real": 4.901044845581055, "step": 670 }, { "epoch": 2.176, "grad_norm": 22.63825175907538, "learning_rate": 1.520190023752969e-07, "logits/generated": -0.9184467196464539, "logits/real": -0.8965224027633667, "logps/generated": -265.4300842285156, "logps/real": -216.04110717773438, "loss": 0.391, "rewards/accuracies": 0.9624999761581421, "rewards/generated": 2.0234487056732178, "rewards/margins": 2.4886739253997803, "rewards/real": 4.512122631072998, "step": 680 }, { "epoch": 2.1824, "eval_logits/generated": -0.8728317618370056, "eval_logits/real": -0.8685613870620728, "eval_logps/generated": -253.65782165527344, "eval_logps/real": -215.30899047851562, "eval_loss": 0.5555368661880493, "eval_rewards/accuracies": 0.8461538553237915, "eval_rewards/generated": 2.9011244773864746, "eval_rewards/margins": 1.9368971586227417, "eval_rewards/real": 4.838022232055664, "eval_runtime": 34.2926, "eval_samples_per_second": 5.832, "eval_steps_per_second": 0.379, "step": 682 }, { "epoch": 2.208, "grad_norm": 13.119772512292641, "learning_rate": 1.4608076009501184e-07, "logits/generated": -0.8462353944778442, "logits/real": -0.8793772459030151, "logps/generated": -262.62139892578125, "logps/real": -217.6599884033203, "loss": 0.3873, "rewards/accuracies": 0.9624999761581421, "rewards/generated": 2.167581796646118, "rewards/margins": 2.5605719089508057, "rewards/real": 4.728153705596924, "step": 690 }, { "epoch": 2.24, "grad_norm": 12.844512060104332, "learning_rate": 1.4014251781472683e-07, "logits/generated": -0.921069324016571, "logits/real": -0.9563784599304199, "logps/generated": -265.6938781738281, "logps/real": -239.77987670898438, "loss": 0.4038, "rewards/accuracies": 0.925000011920929, "rewards/generated": 1.8810170888900757, "rewards/margins": 2.4677670001983643, "rewards/real": 4.348784446716309, "step": 700 }, { "epoch": 2.2720000000000002, "grad_norm": 11.11315802823275, "learning_rate": 1.342042755344418e-07, "logits/generated": -0.9593836665153503, "logits/real": -0.9604368209838867, "logps/generated": -262.6346435546875, "logps/real": -228.54751586914062, "loss": 0.3943, "rewards/accuracies": 0.9125000238418579, "rewards/generated": 2.214599132537842, "rewards/margins": 2.426208734512329, "rewards/real": 4.640807628631592, "step": 710 }, { "epoch": 2.304, "grad_norm": 15.529926292414983, "learning_rate": 1.2826603325415677e-07, "logits/generated": -0.9164473414421082, "logits/real": -0.9070721864700317, "logps/generated": -254.88113403320312, "logps/real": -222.6805419921875, "loss": 0.4011, "rewards/accuracies": 0.9624999761581421, "rewards/generated": 2.0439538955688477, "rewards/margins": 2.852200984954834, "rewards/real": 4.896154880523682, "step": 720 }, { "epoch": 2.336, "grad_norm": 18.280305729890717, "learning_rate": 1.2232779097387173e-07, "logits/generated": -0.9039872884750366, "logits/real": -0.932706356048584, "logps/generated": -263.90277099609375, "logps/real": -230.5477752685547, "loss": 0.3794, "rewards/accuracies": 0.987500011920929, "rewards/generated": 1.9140474796295166, "rewards/margins": 2.7077784538269043, "rewards/real": 4.621825218200684, "step": 730 }, { "epoch": 2.368, "grad_norm": 12.973027129913685, "learning_rate": 1.163895486935867e-07, "logits/generated": -0.9551565051078796, "logits/real": -0.8936797976493835, "logps/generated": -252.80905151367188, "logps/real": -222.890380859375, "loss": 0.3823, "rewards/accuracies": 0.9750000238418579, "rewards/generated": 2.467062473297119, "rewards/margins": 2.3569140434265137, "rewards/real": 4.823976516723633, "step": 740 }, { "epoch": 2.3808, "eval_logits/generated": -0.8613379597663879, "eval_logits/real": -0.861678421497345, "eval_logps/generated": -252.93264770507812, "eval_logps/real": -214.26817321777344, "eval_loss": 0.5525192022323608, "eval_rewards/accuracies": 0.8461538553237915, "eval_rewards/generated": 2.973644733428955, "eval_rewards/margins": 1.9684556722640991, "eval_rewards/real": 4.942100524902344, "eval_runtime": 33.7536, "eval_samples_per_second": 5.925, "eval_steps_per_second": 0.385, "step": 744 }, { "epoch": 2.4, "grad_norm": 29.433270333539912, "learning_rate": 1.1045130641330165e-07, "logits/generated": -0.9705144166946411, "logits/real": -0.9021897315979004, "logps/generated": -259.4241638183594, "logps/real": -229.11349487304688, "loss": 0.4048, "rewards/accuracies": 0.987500011920929, "rewards/generated": 2.026224136352539, "rewards/margins": 2.647815227508545, "rewards/real": 4.674039363861084, "step": 750 }, { "epoch": 2.432, "grad_norm": 20.33942780848402, "learning_rate": 1.0451306413301662e-07, "logits/generated": -0.9670873880386353, "logits/real": -0.9735347628593445, "logps/generated": -260.3061218261719, "logps/real": -226.3241729736328, "loss": 0.3964, "rewards/accuracies": 0.9750000238418579, "rewards/generated": 2.24208402633667, "rewards/margins": 2.5879955291748047, "rewards/real": 4.830079555511475, "step": 760 }, { "epoch": 2.464, "grad_norm": 18.69032739525244, "learning_rate": 9.857482185273158e-08, "logits/generated": -0.895625114440918, "logits/real": -0.9686501622200012, "logps/generated": -261.86175537109375, "logps/real": -214.92385864257812, "loss": 0.3943, "rewards/accuracies": 0.9624999761581421, "rewards/generated": 2.406226634979248, "rewards/margins": 2.6910176277160645, "rewards/real": 5.0972442626953125, "step": 770 }, { "epoch": 2.496, "grad_norm": 24.59655548429365, "learning_rate": 9.263657957244655e-08, "logits/generated": -0.8825882077217102, "logits/real": -0.8406922221183777, "logps/generated": -264.40838623046875, "logps/real": -228.5029296875, "loss": 0.3851, "rewards/accuracies": 0.9624999761581421, "rewards/generated": 2.372819423675537, "rewards/margins": 2.5542354583740234, "rewards/real": 4.927054405212402, "step": 780 }, { "epoch": 2.528, "grad_norm": 21.19722087138631, "learning_rate": 8.669833729216151e-08, "logits/generated": -0.904650866985321, "logits/real": -0.9072428941726685, "logps/generated": -261.8887939453125, "logps/real": -216.5327606201172, "loss": 0.3713, "rewards/accuracies": 1.0, "rewards/generated": 2.101536989212036, "rewards/margins": 2.913972854614258, "rewards/real": 5.015509605407715, "step": 790 }, { "epoch": 2.56, "grad_norm": 36.11799371051388, "learning_rate": 8.076009501187649e-08, "logits/generated": -0.9086967706680298, "logits/real": -0.8840044736862183, "logps/generated": -246.6894989013672, "logps/real": -217.89419555664062, "loss": 0.3705, "rewards/accuracies": 0.9750000238418579, "rewards/generated": 2.551680564880371, "rewards/margins": 2.705740213394165, "rewards/real": 5.257420539855957, "step": 800 }, { "epoch": 2.5792, "eval_logits/generated": -0.8641340732574463, "eval_logits/real": -0.8686401844024658, "eval_logps/generated": -252.9866485595703, "eval_logps/real": -213.82852172851562, "eval_loss": 0.5512068867683411, "eval_rewards/accuracies": 0.8653846383094788, "eval_rewards/generated": 2.9682469367980957, "eval_rewards/margins": 2.0178189277648926, "eval_rewards/real": 4.9860663414001465, "eval_runtime": 35.2986, "eval_samples_per_second": 5.666, "eval_steps_per_second": 0.368, "step": 806 }, { "epoch": 2.592, "grad_norm": 20.653892107957766, "learning_rate": 7.482185273159145e-08, "logits/generated": -0.9367928504943848, "logits/real": -0.9458833932876587, "logps/generated": -259.1912841796875, "logps/real": -220.80810546875, "loss": 0.3699, "rewards/accuracies": 0.9750000238418579, "rewards/generated": 2.3820133209228516, "rewards/margins": 2.902805805206299, "rewards/real": 5.28481912612915, "step": 810 }, { "epoch": 2.624, "grad_norm": 22.14418789305639, "learning_rate": 6.88836104513064e-08, "logits/generated": -0.9513182640075684, "logits/real": -0.976198673248291, "logps/generated": -262.0443115234375, "logps/real": -226.5956573486328, "loss": 0.3708, "rewards/accuracies": 0.9750000238418579, "rewards/generated": 2.108731746673584, "rewards/margins": 2.8131000995635986, "rewards/real": 4.9218316078186035, "step": 820 }, { "epoch": 2.656, "grad_norm": 14.79622491268645, "learning_rate": 6.294536817102138e-08, "logits/generated": -0.9285460710525513, "logits/real": -0.9467372894287109, "logps/generated": -256.03692626953125, "logps/real": -219.3931884765625, "loss": 0.3779, "rewards/accuracies": 0.9375, "rewards/generated": 2.6489083766937256, "rewards/margins": 2.2911267280578613, "rewards/real": 4.94003438949585, "step": 830 }, { "epoch": 2.6879999999999997, "grad_norm": 15.043049371394394, "learning_rate": 5.700712589073634e-08, "logits/generated": -0.9104442596435547, "logits/real": -0.938060462474823, "logps/generated": -266.68731689453125, "logps/real": -215.4854278564453, "loss": 0.3613, "rewards/accuracies": 0.9624999761581421, "rewards/generated": 2.136897563934326, "rewards/margins": 2.9992494583129883, "rewards/real": 5.136146545410156, "step": 840 }, { "epoch": 2.7199999999999998, "grad_norm": 15.242872844336908, "learning_rate": 5.10688836104513e-08, "logits/generated": -0.9151053428649902, "logits/real": -0.9172986149787903, "logps/generated": -268.2730712890625, "logps/real": -224.15652465820312, "loss": 0.3743, "rewards/accuracies": 0.987500011920929, "rewards/generated": 1.8564279079437256, "rewards/margins": 3.136321544647217, "rewards/real": 4.99275016784668, "step": 850 }, { "epoch": 2.752, "grad_norm": 17.468120226063807, "learning_rate": 4.5130641330166267e-08, "logits/generated": -0.9360349774360657, "logits/real": -0.9112297892570496, "logps/generated": -252.3114776611328, "logps/real": -221.2039337158203, "loss": 0.3718, "rewards/accuracies": 0.9750000238418579, "rewards/generated": 2.2368216514587402, "rewards/margins": 2.8169872760772705, "rewards/real": 5.05380916595459, "step": 860 }, { "epoch": 2.7776, "eval_logits/generated": -0.8635996580123901, "eval_logits/real": -0.8680248260498047, "eval_logps/generated": -252.94522094726562, "eval_logps/real": -213.61854553222656, "eval_loss": 0.5554559230804443, "eval_rewards/accuracies": 0.8461538553237915, "eval_rewards/generated": 2.9723880290985107, "eval_rewards/margins": 2.0346765518188477, "eval_rewards/real": 5.007064342498779, "eval_runtime": 33.7658, "eval_samples_per_second": 5.923, "eval_steps_per_second": 0.385, "step": 868 }, { "epoch": 2.784, "grad_norm": 23.996876211666585, "learning_rate": 3.919239904988123e-08, "logits/generated": -0.9030688405036926, "logits/real": -0.8869367837905884, "logps/generated": -263.31884765625, "logps/real": -231.1215057373047, "loss": 0.3838, "rewards/accuracies": 0.987500011920929, "rewards/generated": 2.1591622829437256, "rewards/margins": 2.8290388584136963, "rewards/real": 4.988200664520264, "step": 870 }, { "epoch": 2.816, "grad_norm": 15.933355481968098, "learning_rate": 3.32541567695962e-08, "logits/generated": -0.9254922866821289, "logits/real": -0.9375091791152954, "logps/generated": -257.25921630859375, "logps/real": -218.4817352294922, "loss": 0.3943, "rewards/accuracies": 0.9624999761581421, "rewards/generated": 2.433042049407959, "rewards/margins": 2.4680287837982178, "rewards/real": 4.901071071624756, "step": 880 }, { "epoch": 2.848, "grad_norm": 15.729400330653592, "learning_rate": 2.7315914489311164e-08, "logits/generated": -0.8918999433517456, "logits/real": -0.9543954730033875, "logps/generated": -251.86434936523438, "logps/real": -217.92153930664062, "loss": 0.3792, "rewards/accuracies": 0.9375, "rewards/generated": 2.4217216968536377, "rewards/margins": 2.721459150314331, "rewards/real": 5.143180847167969, "step": 890 }, { "epoch": 2.88, "grad_norm": 14.591897761657874, "learning_rate": 2.1377672209026125e-08, "logits/generated": -0.9395130276679993, "logits/real": -0.9594659805297852, "logps/generated": -265.73876953125, "logps/real": -222.98593139648438, "loss": 0.3686, "rewards/accuracies": 0.987500011920929, "rewards/generated": 2.46151065826416, "rewards/margins": 2.638683795928955, "rewards/real": 5.100194931030273, "step": 900 }, { "epoch": 2.912, "grad_norm": 20.099085379716534, "learning_rate": 1.5439429928741092e-08, "logits/generated": -0.950794517993927, "logits/real": -0.8883028030395508, "logps/generated": -254.50732421875, "logps/real": -226.67514038085938, "loss": 0.3771, "rewards/accuracies": 0.925000011920929, "rewards/generated": 2.2818403244018555, "rewards/margins": 2.8682749271392822, "rewards/real": 5.150115013122559, "step": 910 }, { "epoch": 2.944, "grad_norm": 27.113250586107085, "learning_rate": 9.501187648456057e-09, "logits/generated": -0.9123473167419434, "logits/real": -0.8874839544296265, "logps/generated": -254.9096221923828, "logps/real": -226.13595581054688, "loss": 0.3878, "rewards/accuracies": 0.9750000238418579, "rewards/generated": 2.072730302810669, "rewards/margins": 2.4795820713043213, "rewards/real": 4.55231237411499, "step": 920 }, { "epoch": 2.976, "grad_norm": 18.727290911205092, "learning_rate": 3.562945368171021e-09, "logits/generated": -0.9185419082641602, "logits/real": -0.9089628458023071, "logps/generated": -247.39254760742188, "logps/real": -213.3396453857422, "loss": 0.4001, "rewards/accuracies": 0.925000011920929, "rewards/generated": 2.629145860671997, "rewards/margins": 2.3609352111816406, "rewards/real": 4.990080833435059, "step": 930 }, { "epoch": 2.976, "eval_logits/generated": -0.874191164970398, "eval_logits/real": -0.8781108260154724, "eval_logps/generated": -253.27622985839844, "eval_logps/real": -213.75250244140625, "eval_loss": 0.552344799041748, "eval_rewards/accuracies": 0.8461538553237915, "eval_rewards/generated": 2.939284324645996, "eval_rewards/margins": 2.054386854171753, "eval_rewards/real": 4.993671894073486, "eval_runtime": 34.0216, "eval_samples_per_second": 5.879, "eval_steps_per_second": 0.382, "step": 930 }, { "epoch": 2.9952, "step": 936, "total_flos": 0.0, "train_loss": 0.5569279177321328, "train_runtime": 15113.3849, "train_samples_per_second": 1.983, "train_steps_per_second": 0.062 } ], "logging_steps": 10, "max_steps": 936, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 62, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }