{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9984, "eval_steps": 500, "global_step": 156, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "learning_rate": 3.125e-07, "logits/chosen": -0.18967239558696747, "logits/rejected": -0.41899582743644714, "logps/chosen": -1382.499267578125, "logps/rejected": -2863.098388671875, "loss": 0.5, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.06, "learning_rate": 3.125e-06, "logits/chosen": -0.15270556509494781, "logits/rejected": -0.2915438413619995, "logps/chosen": -2153.170654296875, "logps/rejected": -3371.856689453125, "loss": 0.4997, "rewards/accuracies": 0.3993055522441864, "rewards/chosen": -0.007403078954666853, "rewards/margins": 0.0016438195016235113, "rewards/rejected": -0.00904689822345972, "step": 10 }, { "epoch": 0.13, "learning_rate": 4.989935734988098e-06, "logits/chosen": -0.2100469321012497, "logits/rejected": -0.32501596212387085, "logps/chosen": -2614.16064453125, "logps/rejected": -3642.28076171875, "loss": 0.4963, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.1257179230451584, "rewards/margins": 0.020404411479830742, "rewards/rejected": -0.14612232148647308, "step": 20 }, { "epoch": 0.19, "learning_rate": 4.8776412907378845e-06, "logits/chosen": -0.19536757469177246, "logits/rejected": -0.4390452802181244, "logps/chosen": -2770.899169921875, "logps/rejected": -4525.7744140625, "loss": 0.4889, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.5548638701438904, "rewards/margins": 0.1145726665854454, "rewards/rejected": -0.6694365739822388, "step": 30 }, { "epoch": 0.26, "learning_rate": 4.646121984004666e-06, "logits/chosen": -0.23629632592201233, "logits/rejected": -0.5095050930976868, "logps/chosen": -3372.510498046875, "logps/rejected": -5071.37451171875, "loss": 0.4868, "rewards/accuracies": 0.503125011920929, "rewards/chosen": -1.0432322025299072, "rewards/margins": 0.22232362627983093, "rewards/rejected": -1.26555597782135, "step": 40 }, { "epoch": 0.32, "learning_rate": 4.3069871595684795e-06, "logits/chosen": -0.32125282287597656, "logits/rejected": -0.562100887298584, "logps/chosen": -3342.119873046875, "logps/rejected": -5389.53955078125, "loss": 0.4856, "rewards/accuracies": 0.546875, "rewards/chosen": -0.9807957410812378, "rewards/margins": 0.4719497263431549, "rewards/rejected": -1.4527455568313599, "step": 50 }, { "epoch": 0.38, "learning_rate": 3.8772424536302565e-06, "logits/chosen": -0.3323759138584137, "logits/rejected": -0.5304074883460999, "logps/chosen": -3480.371826171875, "logps/rejected": -4698.51025390625, "loss": 0.4883, "rewards/accuracies": 0.53125, "rewards/chosen": -0.97138512134552, "rewards/margins": 0.20621006190776825, "rewards/rejected": -1.1775951385498047, "step": 60 }, { "epoch": 0.45, "learning_rate": 3.3784370602033572e-06, "logits/chosen": -0.3379088044166565, "logits/rejected": -0.5837884545326233, "logps/chosen": -3488.02587890625, "logps/rejected": -5069.7412109375, "loss": 0.4871, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.0387831926345825, "rewards/margins": 0.1595762073993683, "rewards/rejected": -1.198359489440918, "step": 70 }, { "epoch": 0.51, "learning_rate": 2.835583164544139e-06, "logits/chosen": -0.2774398624897003, "logits/rejected": -0.5058914422988892, "logps/chosen": -2817.748779296875, "logps/rejected": -4369.88720703125, "loss": 0.4891, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.8934494853019714, "rewards/margins": 0.030101608484983444, "rewards/rejected": -0.9235512018203735, "step": 80 }, { "epoch": 0.58, "learning_rate": 2.2759017277414165e-06, "logits/chosen": -0.21915356814861298, "logits/rejected": -0.47689905762672424, "logps/chosen": -2961.93994140625, "logps/rejected": -4903.95166015625, "loss": 0.4839, "rewards/accuracies": 0.5531250238418579, "rewards/chosen": -0.7773372530937195, "rewards/margins": 0.3540068566799164, "rewards/rejected": -1.1313440799713135, "step": 90 }, { "epoch": 0.64, "learning_rate": 1.7274575140626318e-06, "logits/chosen": -0.3382042348384857, "logits/rejected": -0.6111919283866882, "logps/chosen": -2994.310302734375, "logps/rejected": -4617.9794921875, "loss": 0.4856, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.9103735089302063, "rewards/margins": 0.1410413533449173, "rewards/rejected": -1.051414966583252, "step": 100 }, { "epoch": 0.7, "learning_rate": 1.217751806485235e-06, "logits/chosen": -0.28220734000205994, "logits/rejected": -0.5584867000579834, "logps/chosen": -2910.51953125, "logps/rejected": -4760.12158203125, "loss": 0.4865, "rewards/accuracies": 0.503125011920929, "rewards/chosen": -0.8143825531005859, "rewards/margins": 0.21515560150146484, "rewards/rejected": -1.0295381546020508, "step": 110 }, { "epoch": 0.77, "learning_rate": 7.723433775328385e-07, "logits/chosen": -0.281380295753479, "logits/rejected": -0.500409722328186, "logps/chosen": -2965.12841796875, "logps/rejected": -4710.0673828125, "loss": 0.486, "rewards/accuracies": 0.5531250238418579, "rewards/chosen": -0.8462094068527222, "rewards/margins": 0.2777535617351532, "rewards/rejected": -1.1239629983901978, "step": 120 }, { "epoch": 0.83, "learning_rate": 4.1356686569674344e-07, "logits/chosen": -0.24006013572216034, "logits/rejected": -0.46842899918556213, "logps/chosen": -3051.799072265625, "logps/rejected": -4552.3515625, "loss": 0.4854, "rewards/accuracies": 0.546875, "rewards/chosen": -0.8304941058158875, "rewards/margins": 0.16177485883235931, "rewards/rejected": -0.9922689199447632, "step": 130 }, { "epoch": 0.9, "learning_rate": 1.59412823400657e-07, "logits/chosen": -0.2925623953342438, "logits/rejected": -0.4558919072151184, "logps/chosen": -3453.52880859375, "logps/rejected": -4216.4990234375, "loss": 0.4876, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.9790051579475403, "rewards/margins": -0.0291127972304821, "rewards/rejected": -0.9498924016952515, "step": 140 }, { "epoch": 0.96, "learning_rate": 2.262559558016325e-08, "logits/chosen": -0.29268592596054077, "logits/rejected": -0.5145989656448364, "logps/chosen": -2635.45703125, "logps/rejected": -4756.46875, "loss": 0.4888, "rewards/accuracies": 0.546875, "rewards/chosen": -0.7879113554954529, "rewards/margins": 0.24760587513446808, "rewards/rejected": -1.0355170965194702, "step": 150 }, { "epoch": 1.0, "step": 156, "total_flos": 0.0, "train_loss": 0.4880518607604198, "train_runtime": 4047.7642, "train_samples_per_second": 4.941, "train_steps_per_second": 0.039 } ], "logging_steps": 10, "max_steps": 156, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }