{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 157, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006369426751592357, "grad_norm": 553.6689195387686, "learning_rate": 3.125e-08, "logits/generated": -2.5852508544921875, "logits/real": -2.6413676738739014, "logps/generated": -325.0230407714844, "logps/real": -285.6551513671875, "loss": 0.9368, "rewards/accuracies": 0.0, "rewards/generated": 0.0, "rewards/margins": 0.0, "rewards/real": 0.0, "step": 1 }, { "epoch": 0.06369426751592357, "grad_norm": 114.25554694751857, "learning_rate": 3.1249999999999997e-07, "logits/generated": -2.6055305004119873, "logits/real": -2.5509183406829834, "logps/generated": -363.4412536621094, "logps/real": -227.83956909179688, "loss": 0.6399, "rewards/accuracies": 0.7916666865348816, "rewards/generated": -1.371843934059143, "rewards/margins": 1.6683220863342285, "rewards/real": 0.2964780032634735, "step": 10 }, { "epoch": 0.12738853503184713, "grad_norm": 32.95472745041124, "learning_rate": 4.858156028368794e-07, "logits/generated": -2.859358072280884, "logits/real": -2.843193531036377, "logps/generated": -321.1373596191406, "logps/real": -200.79653930664062, "loss": 0.2658, "rewards/accuracies": 1.0, "rewards/generated": -4.476873874664307, "rewards/margins": 5.808846950531006, "rewards/real": 1.3319734334945679, "step": 20 }, { "epoch": 0.1910828025477707, "grad_norm": 15.376345388496501, "learning_rate": 4.50354609929078e-07, "logits/generated": -2.8705520629882812, "logits/real": -2.8971784114837646, "logps/generated": -368.5296325683594, "logps/real": -192.18089294433594, "loss": 0.2291, "rewards/accuracies": 1.0, "rewards/generated": -5.970882415771484, "rewards/margins": 9.524211883544922, "rewards/real": 3.553328275680542, "step": 30 }, { "epoch": 0.25477707006369427, "grad_norm": 32.240493773919496, "learning_rate": 4.148936170212766e-07, "logits/generated": -2.9980812072753906, "logits/real": -2.925062656402588, "logps/generated": -352.6733093261719, "logps/real": -184.546142578125, "loss": 0.2192, "rewards/accuracies": 0.9624999761581421, "rewards/generated": -5.158407688140869, "rewards/margins": 9.533308029174805, "rewards/real": 4.374899864196777, "step": 40 }, { "epoch": 0.3184713375796178, "grad_norm": 22.443882761667297, "learning_rate": 3.7943262411347514e-07, "logits/generated": -2.9874608516693115, "logits/real": -2.901106357574463, "logps/generated": -366.24774169921875, "logps/real": -178.15200805664062, "loss": 0.1918, "rewards/accuracies": 1.0, "rewards/generated": -5.891494274139404, "rewards/margins": 10.835145950317383, "rewards/real": 4.94365119934082, "step": 50 }, { "epoch": 0.3821656050955414, "grad_norm": 27.90941399161424, "learning_rate": 3.4397163120567375e-07, "logits/generated": -2.9581241607666016, "logits/real": -2.923001527786255, "logps/generated": -340.056884765625, "logps/real": -175.193603515625, "loss": 0.2088, "rewards/accuracies": 1.0, "rewards/generated": -4.940432548522949, "rewards/margins": 10.540928840637207, "rewards/real": 5.6004958152771, "step": 60 }, { "epoch": 0.445859872611465, "grad_norm": 19.61548013874602, "learning_rate": 3.085106382978723e-07, "logits/generated": -2.950155735015869, "logits/real": -2.920978546142578, "logps/generated": -374.2133483886719, "logps/real": -176.62344360351562, "loss": 0.1973, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -5.149114608764648, "rewards/margins": 11.470724105834961, "rewards/real": 6.321610450744629, "step": 70 }, { "epoch": 0.5095541401273885, "grad_norm": 7.313023868270791, "learning_rate": 2.730496453900709e-07, "logits/generated": -2.891042709350586, "logits/real": -2.816384792327881, "logps/generated": -343.1810607910156, "logps/real": -187.75689697265625, "loss": 0.1921, "rewards/accuracies": 0.987500011920929, "rewards/generated": -4.03465461730957, "rewards/margins": 11.060041427612305, "rewards/real": 7.025385856628418, "step": 80 }, { "epoch": 0.5732484076433121, "grad_norm": 5.902065577304844, "learning_rate": 2.375886524822695e-07, "logits/generated": -2.7973499298095703, "logits/real": -2.7673676013946533, "logps/generated": -361.3358154296875, "logps/real": -168.44692993164062, "loss": 0.1829, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -4.653961658477783, "rewards/margins": 10.882070541381836, "rewards/real": 6.228109359741211, "step": 90 }, { "epoch": 0.6369426751592356, "grad_norm": 21.378621598556965, "learning_rate": 2.0212765957446807e-07, "logits/generated": -2.8296706676483154, "logits/real": -2.783637762069702, "logps/generated": -385.32916259765625, "logps/real": -161.2016143798828, "loss": 0.1804, "rewards/accuracies": 1.0, "rewards/generated": -5.7995285987854, "rewards/margins": 12.244100570678711, "rewards/real": 6.444572448730469, "step": 100 }, { "epoch": 0.6369426751592356, "eval_logits/generated": -2.8830642700195312, "eval_logits/real": -2.8163247108459473, "eval_logps/generated": -304.90185546875, "eval_logps/real": -157.32879638671875, "eval_loss": 0.19246906042099, "eval_rewards/accuracies": 0.9921875, "eval_rewards/generated": -4.66813850402832, "eval_rewards/margins": 10.201096534729004, "eval_rewards/real": 5.532957077026367, "eval_runtime": 40.3701, "eval_samples_per_second": 12.385, "eval_steps_per_second": 0.396, "step": 100 }, { "epoch": 0.7006369426751592, "grad_norm": 4.836243072704085, "learning_rate": 1.6666666666666665e-07, "logits/generated": -2.814863920211792, "logits/real": -2.813689708709717, "logps/generated": -349.199951171875, "logps/real": -167.05197143554688, "loss": 0.1732, "rewards/accuracies": 1.0, "rewards/generated": -4.092279434204102, "rewards/margins": 10.799135208129883, "rewards/real": 6.706856727600098, "step": 110 }, { "epoch": 0.7643312101910829, "grad_norm": 10.2454630213605, "learning_rate": 1.3120567375886523e-07, "logits/generated": -2.6291651725769043, "logits/real": -2.6084656715393066, "logps/generated": -357.1519470214844, "logps/real": -194.95156860351562, "loss": 0.2818, "rewards/accuracies": 0.987500011920929, "rewards/generated": -6.045389175415039, "rewards/margins": 10.61597728729248, "rewards/real": 4.570586681365967, "step": 120 }, { "epoch": 0.8280254777070064, "grad_norm": 30.968094217096088, "learning_rate": 9.574468085106382e-08, "logits/generated": -2.7041964530944824, "logits/real": -2.6806530952453613, "logps/generated": -349.06878662109375, "logps/real": -164.90850830078125, "loss": 0.1784, "rewards/accuracies": 1.0, "rewards/generated": -5.374934196472168, "rewards/margins": 13.002920150756836, "rewards/real": 7.627985954284668, "step": 130 }, { "epoch": 0.89171974522293, "grad_norm": 39.77463051390536, "learning_rate": 6.02836879432624e-08, "logits/generated": -2.705962896347046, "logits/real": -2.6498522758483887, "logps/generated": -338.68121337890625, "logps/real": -155.6457061767578, "loss": 0.1802, "rewards/accuracies": 0.987500011920929, "rewards/generated": -4.318178176879883, "rewards/margins": 11.076618194580078, "rewards/real": 6.758440971374512, "step": 140 }, { "epoch": 0.9554140127388535, "grad_norm": 37.53390149893301, "learning_rate": 2.4822695035460993e-08, "logits/generated": -2.6534829139709473, "logits/real": -2.6331899166107178, "logps/generated": -302.69781494140625, "logps/real": -132.01724243164062, "loss": 0.1803, "rewards/accuracies": 1.0, "rewards/generated": -3.575101375579834, "rewards/margins": 9.780950546264648, "rewards/real": 6.20584774017334, "step": 150 }, { "epoch": 1.0, "step": 157, "total_flos": 0.0, "train_loss": 0.23225101619769054, "train_runtime": 1278.722, "train_samples_per_second": 3.91, "train_steps_per_second": 0.123 } ], "logging_steps": 10, "max_steps": 157, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }