{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.988679245283019, "eval_steps": 500, "global_step": 396, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "learning_rate": 1.25e-08, "logps/chosen": -22.472335815429688, "logps/rejected": -25.36812400817871, "loss": 0.6931, "losses/dpo": 0.6931471824645996, "losses/sft": 0.7711470723152161, "losses/total": 0.6931471824645996, "ref_logps/chosen": -22.472335815429688, "ref_logps/rejected": -25.36812400817871, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.02, "learning_rate": 2.5e-08, "logps/chosen": -21.278339385986328, "logps/rejected": -25.130128860473633, "loss": 0.6931, "losses/dpo": 0.6931471824645996, "losses/sft": 0.8523496985435486, "losses/total": 0.6931471824645996, "ref_logps/chosen": -21.278339385986328, "ref_logps/rejected": -25.130128860473633, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "epoch": 0.02, "learning_rate": 3.75e-08, "logps/chosen": -21.53506851196289, "logps/rejected": -26.44188690185547, "loss": 0.693, "losses/dpo": 0.6928481459617615, "losses/sft": 0.6631997227668762, "losses/total": 0.6928481459617615, "ref_logps/chosen": -21.54958152770996, "ref_logps/rejected": -26.452028274536133, "rewards/accuracies": 0.515625, "rewards/chosen": 0.001451290212571621, "rewards/margins": 0.00043702672701328993, "rewards/rejected": 0.0010142631363123655, "step": 3 }, { "epoch": 0.03, "learning_rate": 5e-08, "logps/chosen": -21.846920013427734, "logps/rejected": -26.232192993164062, "loss": 0.6935, "losses/dpo": 0.6933612823486328, "losses/sft": 0.819932758808136, "losses/total": 0.6933612823486328, "ref_logps/chosen": -21.842269897460938, "ref_logps/rejected": -26.234174728393555, "rewards/accuracies": 0.4609375, "rewards/chosen": -0.00046504498459398746, "rewards/margins": -0.000663207727484405, "rewards/rejected": 0.0001981628010980785, "step": 4 }, { "epoch": 0.04, "learning_rate": 6.25e-08, "logps/chosen": -23.82025146484375, "logps/rejected": -26.558738708496094, "loss": 0.693, "losses/dpo": 0.6929464340209961, "losses/sft": 0.7624120712280273, "losses/total": 0.6929464340209961, "ref_logps/chosen": -23.817665100097656, "ref_logps/rejected": -26.55132293701172, "rewards/accuracies": 0.5234375, "rewards/chosen": -0.00025857496075332165, "rewards/margins": 0.0004831284750252962, "rewards/rejected": -0.0007417035521939397, "step": 5 }, { "epoch": 0.05, "learning_rate": 7.5e-08, "logps/chosen": -25.088871002197266, "logps/rejected": -29.653806686401367, "loss": 0.6923, "losses/dpo": 0.6934427618980408, "losses/sft": 0.7273141741752625, "losses/total": 0.6934427618980408, "ref_logps/chosen": -25.0992431640625, "ref_logps/rejected": -29.64551544189453, "rewards/accuracies": 0.6015625, "rewards/chosen": 0.0010370061499997973, "rewards/margins": 0.0018662326037883759, "rewards/rejected": -0.0008292265702039003, "step": 6 }, { "epoch": 0.05, "learning_rate": 8.75e-08, "logps/chosen": -23.075027465820312, "logps/rejected": -27.50556182861328, "loss": 0.693, "losses/dpo": 0.6948896646499634, "losses/sft": 0.6432714462280273, "losses/total": 0.6948896646499634, "ref_logps/chosen": -23.066946029663086, "ref_logps/rejected": -27.4930362701416, "rewards/accuracies": 0.4765625, "rewards/chosen": -0.0008082209387794137, "rewards/margins": 0.0004443599027581513, "rewards/rejected": -0.001252580899745226, "step": 7 }, { "epoch": 0.06, "learning_rate": 1e-07, "logps/chosen": -21.430335998535156, "logps/rejected": -29.949260711669922, "loss": 0.6933, "losses/dpo": 0.6911635398864746, "losses/sft": 0.8042243123054504, "losses/total": 0.6911635398864746, "ref_logps/chosen": -21.44394302368164, "ref_logps/rejected": -29.96406364440918, "rewards/accuracies": 0.4921875, "rewards/chosen": 0.0013606649590656161, "rewards/margins": -0.00011985772289335728, "rewards/rejected": 0.0014805227983742952, "step": 8 }, { "epoch": 0.07, "learning_rate": 1.125e-07, "logps/chosen": -23.053390502929688, "logps/rejected": -27.866111755371094, "loss": 0.6923, "losses/dpo": 0.6914368271827698, "losses/sft": 0.8787165284156799, "losses/total": 0.6914368271827698, "ref_logps/chosen": -23.060134887695312, "ref_logps/rejected": -27.85537338256836, "rewards/accuracies": 0.5546875, "rewards/chosen": 0.0006745259161107242, "rewards/margins": 0.0017485294956713915, "rewards/rejected": -0.0010740034049376845, "step": 9 }, { "epoch": 0.08, "learning_rate": 1.25e-07, "logps/chosen": -23.637466430664062, "logps/rejected": -29.587308883666992, "loss": 0.6922, "losses/dpo": 0.690066397190094, "losses/sft": 1.0419297218322754, "losses/total": 0.690066397190094, "ref_logps/chosen": -23.649028778076172, "ref_logps/rejected": -29.579374313354492, "rewards/accuracies": 0.546875, "rewards/chosen": 0.0011563875013962388, "rewards/margins": 0.0019498697947710752, "rewards/rejected": -0.0007934823515824974, "step": 10 }, { "epoch": 0.08, "learning_rate": 1.375e-07, "logps/chosen": -22.38899040222168, "logps/rejected": -24.971160888671875, "loss": 0.6926, "losses/dpo": 0.6951523423194885, "losses/sft": 0.9443475008010864, "losses/total": 0.6951523423194885, "ref_logps/chosen": -22.398780822753906, "ref_logps/rejected": -24.969751358032227, "rewards/accuracies": 0.5078125, "rewards/chosen": 0.000978996278718114, "rewards/margins": 0.0011200353037565947, "rewards/rejected": -0.00014103890862315893, "step": 11 }, { "epoch": 0.09, "learning_rate": 1.5e-07, "logps/chosen": -20.165252685546875, "logps/rejected": -26.619457244873047, "loss": 0.6946, "losses/dpo": 0.6987805962562561, "losses/sft": 0.876471221446991, "losses/total": 0.6987805962562561, "ref_logps/chosen": -20.14897918701172, "ref_logps/rejected": -26.63131332397461, "rewards/accuracies": 0.421875, "rewards/chosen": -0.001627539866603911, "rewards/margins": -0.002813115483149886, "rewards/rejected": 0.0011855755001306534, "step": 12 }, { "epoch": 0.1, "learning_rate": 1.625e-07, "logps/chosen": -25.07573699951172, "logps/rejected": -25.939855575561523, "loss": 0.6936, "losses/dpo": 0.6952416896820068, "losses/sft": 0.9322817325592041, "losses/total": 0.6952416896820068, "ref_logps/chosen": -25.075220108032227, "ref_logps/rejected": -25.947521209716797, "rewards/accuracies": 0.515625, "rewards/chosen": -5.185510963201523e-05, "rewards/margins": -0.0008183673489838839, "rewards/rejected": 0.0007665121229365468, "step": 13 }, { "epoch": 0.11, "learning_rate": 1.75e-07, "logps/chosen": -22.58213233947754, "logps/rejected": -27.590843200683594, "loss": 0.692, "losses/dpo": 0.6901522874832153, "losses/sft": 0.8234641551971436, "losses/total": 0.6901522874832153, "ref_logps/chosen": -22.58617401123047, "ref_logps/rejected": -27.570602416992188, "rewards/accuracies": 0.578125, "rewards/chosen": 0.00040407240157946944, "rewards/margins": 0.002427991945296526, "rewards/rejected": -0.0020239197183400393, "step": 14 }, { "epoch": 0.11, "learning_rate": 1.875e-07, "logps/chosen": -23.004196166992188, "logps/rejected": -25.858173370361328, "loss": 0.6925, "losses/dpo": 0.6923660039901733, "losses/sft": 0.7345502376556396, "losses/total": 0.6923660039901733, "ref_logps/chosen": -23.010601043701172, "ref_logps/rejected": -25.85067367553711, "rewards/accuracies": 0.515625, "rewards/chosen": 0.0006403709994629025, "rewards/margins": 0.0013903947547078133, "rewards/rejected": -0.0007500239298678935, "step": 15 }, { "epoch": 0.12, "learning_rate": 2e-07, "logps/chosen": -21.546062469482422, "logps/rejected": -25.777360916137695, "loss": 0.6931, "losses/dpo": 0.6901232004165649, "losses/sft": 0.8039647936820984, "losses/total": 0.6901232004165649, "ref_logps/chosen": -21.53840446472168, "ref_logps/rejected": -25.766767501831055, "rewards/accuracies": 0.5390625, "rewards/chosen": -0.0007656853413209319, "rewards/margins": 0.0002937153331004083, "rewards/rejected": -0.0010594006162136793, "step": 16 }, { "epoch": 0.13, "learning_rate": 2.1249999999999998e-07, "logps/chosen": -22.206989288330078, "logps/rejected": -27.877731323242188, "loss": 0.6937, "losses/dpo": 0.6932737827301025, "losses/sft": 0.7667961716651917, "losses/total": 0.6932737827301025, "ref_logps/chosen": -22.19771957397461, "ref_logps/rejected": -27.87958335876465, "rewards/accuracies": 0.4453125, "rewards/chosen": -0.0009270801674574614, "rewards/margins": -0.0011123311705887318, "rewards/rejected": 0.0001852509449236095, "step": 17 }, { "epoch": 0.14, "learning_rate": 2.25e-07, "logps/chosen": -21.215139389038086, "logps/rejected": -25.75381088256836, "loss": 0.693, "losses/dpo": 0.6932240724563599, "losses/sft": 0.736687421798706, "losses/total": 0.6932240724563599, "ref_logps/chosen": -21.212387084960938, "ref_logps/rejected": -25.746326446533203, "rewards/accuracies": 0.5, "rewards/chosen": -0.00027507508639246225, "rewards/margins": 0.000473553518531844, "rewards/rejected": -0.0007486287504434586, "step": 18 }, { "epoch": 0.14, "learning_rate": 2.3749999999999998e-07, "logps/chosen": -22.499832153320312, "logps/rejected": -26.145751953125, "loss": 0.6932, "losses/dpo": 0.6942628622055054, "losses/sft": 0.7466978430747986, "losses/total": 0.6942628622055054, "ref_logps/chosen": -22.496463775634766, "ref_logps/rejected": -26.141849517822266, "rewards/accuracies": 0.484375, "rewards/chosen": -0.0003368390607647598, "rewards/margins": 5.3280091378837824e-05, "rewards/rejected": -0.0003901191521435976, "step": 19 }, { "epoch": 0.15, "learning_rate": 2.5e-07, "logps/chosen": -21.5505428314209, "logps/rejected": -25.036113739013672, "loss": 0.693, "losses/dpo": 0.688271164894104, "losses/sft": 0.8725596070289612, "losses/total": 0.688271164894104, "ref_logps/chosen": -21.558109283447266, "ref_logps/rejected": -25.038726806640625, "rewards/accuracies": 0.5, "rewards/chosen": 0.0007566105341538787, "rewards/margins": 0.0004949538852088153, "rewards/rejected": 0.0002616568235680461, "step": 20 }, { "epoch": 0.16, "learning_rate": 2.625e-07, "logps/chosen": -21.649169921875, "logps/rejected": -25.741392135620117, "loss": 0.6917, "losses/dpo": 0.6939514875411987, "losses/sft": 0.7525328993797302, "losses/total": 0.6939514875411987, "ref_logps/chosen": -21.666126251220703, "ref_logps/rejected": -25.72817611694336, "rewards/accuracies": 0.640625, "rewards/chosen": 0.0016953760059550405, "rewards/margins": 0.003016936592757702, "rewards/rejected": -0.0013215603539720178, "step": 21 }, { "epoch": 0.17, "learning_rate": 2.75e-07, "logps/chosen": -21.422496795654297, "logps/rejected": -26.453773498535156, "loss": 0.695, "losses/dpo": 0.699163019657135, "losses/sft": 0.7248706221580505, "losses/total": 0.699163019657135, "ref_logps/chosen": -21.396032333374023, "ref_logps/rejected": -26.464006423950195, "rewards/accuracies": 0.4296875, "rewards/chosen": -0.002646287204697728, "rewards/margins": -0.003669553902000189, "rewards/rejected": 0.0010232668137177825, "step": 22 }, { "epoch": 0.17, "learning_rate": 2.8749999999999995e-07, "logps/chosen": -21.21988868713379, "logps/rejected": -25.13469886779785, "loss": 0.6929, "losses/dpo": 0.6908746957778931, "losses/sft": 0.7899657487869263, "losses/total": 0.6908746957778931, "ref_logps/chosen": -21.22311782836914, "ref_logps/rejected": -25.131580352783203, "rewards/accuracies": 0.515625, "rewards/chosen": 0.0003229643334634602, "rewards/margins": 0.0006348754977807403, "rewards/rejected": -0.00031191116431728005, "step": 23 }, { "epoch": 0.18, "learning_rate": 3e-07, "logps/chosen": -24.172225952148438, "logps/rejected": -27.93877410888672, "loss": 0.6936, "losses/dpo": 0.6931849718093872, "losses/sft": 0.7270597219467163, "losses/total": 0.6931849718093872, "ref_logps/chosen": -24.16461944580078, "ref_logps/rejected": -27.940391540527344, "rewards/accuracies": 0.4765625, "rewards/chosen": -0.000760397466365248, "rewards/margins": -0.0009220357751473784, "rewards/rejected": 0.00016163833788596094, "step": 24 }, { "epoch": 0.19, "learning_rate": 3.1249999999999997e-07, "logps/chosen": -23.023677825927734, "logps/rejected": -23.77918243408203, "loss": 0.6929, "losses/dpo": 0.6930486559867859, "losses/sft": 0.779391884803772, "losses/total": 0.6930486559867859, "ref_logps/chosen": -23.028684616088867, "ref_logps/rejected": -23.778560638427734, "rewards/accuracies": 0.546875, "rewards/chosen": 0.0005005812272429466, "rewards/margins": 0.0005629429360851645, "rewards/rejected": -6.23615924268961e-05, "step": 25 }, { "epoch": 0.2, "learning_rate": 3.25e-07, "logps/chosen": -24.240978240966797, "logps/rejected": -30.183570861816406, "loss": 0.6923, "losses/dpo": 0.6919558644294739, "losses/sft": 0.8828473091125488, "losses/total": 0.6919558644294739, "ref_logps/chosen": -24.253870010375977, "ref_logps/rejected": -30.17804718017578, "rewards/accuracies": 0.5078125, "rewards/chosen": 0.001289202249608934, "rewards/margins": 0.0018417320679873228, "rewards/rejected": -0.0005525298183783889, "step": 26 }, { "epoch": 0.2, "learning_rate": 3.375e-07, "logps/chosen": -22.371261596679688, "logps/rejected": -28.10503387451172, "loss": 0.6911, "losses/dpo": 0.6919010281562805, "losses/sft": 0.9361266493797302, "losses/total": 0.6919010281562805, "ref_logps/chosen": -22.4020938873291, "ref_logps/rejected": -28.094257354736328, "rewards/accuracies": 0.59375, "rewards/chosen": 0.0030831946060061455, "rewards/margins": 0.004160974640399218, "rewards/rejected": -0.001077780150808394, "step": 27 }, { "epoch": 0.21, "learning_rate": 3.5e-07, "logps/chosen": -21.107967376708984, "logps/rejected": -27.053752899169922, "loss": 0.6921, "losses/dpo": 0.6916664838790894, "losses/sft": 0.8491181135177612, "losses/total": 0.6916664838790894, "ref_logps/chosen": -21.1080379486084, "ref_logps/rejected": -27.03229331970215, "rewards/accuracies": 0.5390625, "rewards/chosen": 7.087946869432926e-06, "rewards/margins": 0.0021530785597860813, "rewards/rejected": -0.00214599072933197, "step": 28 }, { "epoch": 0.22, "learning_rate": 3.6249999999999997e-07, "logps/chosen": -23.424461364746094, "logps/rejected": -27.092483520507812, "loss": 0.6906, "losses/dpo": 0.6926239728927612, "losses/sft": 0.7789149284362793, "losses/total": 0.6926239728927612, "ref_logps/chosen": -23.46218490600586, "ref_logps/rejected": -27.07909393310547, "rewards/accuracies": 0.59375, "rewards/chosen": 0.0037725979927927256, "rewards/margins": 0.005111560225486755, "rewards/rejected": -0.0013389625819399953, "step": 29 }, { "epoch": 0.23, "learning_rate": 3.75e-07, "logps/chosen": -22.859556198120117, "logps/rejected": -27.201662063598633, "loss": 0.6933, "losses/dpo": 0.6948127746582031, "losses/sft": 0.7969105243682861, "losses/total": 0.6948127746582031, "ref_logps/chosen": -22.869096755981445, "ref_logps/rejected": -27.212430953979492, "rewards/accuracies": 0.515625, "rewards/chosen": 0.0009542852640151978, "rewards/margins": -0.00012272456660866737, "rewards/rejected": 0.001077009947039187, "step": 30 }, { "epoch": 0.23, "learning_rate": 3.875e-07, "logps/chosen": -22.666168212890625, "logps/rejected": -25.310596466064453, "loss": 0.6918, "losses/dpo": 0.6922581195831299, "losses/sft": 0.7759775519371033, "losses/total": 0.6922581195831299, "ref_logps/chosen": -22.68026351928711, "ref_logps/rejected": -25.297521591186523, "rewards/accuracies": 0.5625, "rewards/chosen": 0.001409594900906086, "rewards/margins": 0.002717201365157962, "rewards/rejected": -0.001307606347836554, "step": 31 }, { "epoch": 0.24, "learning_rate": 4e-07, "logps/chosen": -23.281084060668945, "logps/rejected": -28.84569549560547, "loss": 0.693, "losses/dpo": 0.6980300545692444, "losses/sft": 0.7636886835098267, "losses/total": 0.6980300545692444, "ref_logps/chosen": -23.299869537353516, "ref_logps/rejected": -28.859834671020508, "rewards/accuracies": 0.5390625, "rewards/chosen": 0.0018782642437145114, "rewards/margins": 0.000464284501504153, "rewards/rejected": 0.0014139798004180193, "step": 32 }, { "epoch": 0.25, "learning_rate": 4.1249999999999997e-07, "logps/chosen": -20.922544479370117, "logps/rejected": -27.139453887939453, "loss": 0.6914, "losses/dpo": 0.6892759799957275, "losses/sft": 0.7832686901092529, "losses/total": 0.6892759799957275, "ref_logps/chosen": -20.949806213378906, "ref_logps/rejected": -27.13178253173828, "rewards/accuracies": 0.6484375, "rewards/chosen": 0.0027261325158178806, "rewards/margins": 0.003493295982480049, "rewards/rejected": -0.0007671635248698294, "step": 33 }, { "epoch": 0.26, "learning_rate": 4.2499999999999995e-07, "logps/chosen": -22.535436630249023, "logps/rejected": -26.6143798828125, "loss": 0.6926, "losses/dpo": 0.6938276290893555, "losses/sft": 0.7895969152450562, "losses/total": 0.6938276290893555, "ref_logps/chosen": -22.540180206298828, "ref_logps/rejected": -26.607288360595703, "rewards/accuracies": 0.53125, "rewards/chosen": 0.0004745282931253314, "rewards/margins": 0.0011834825854748487, "rewards/rejected": -0.0007089540013112128, "step": 34 }, { "epoch": 0.26, "learning_rate": 4.375e-07, "logps/chosen": -21.444934844970703, "logps/rejected": -27.329378128051758, "loss": 0.6928, "losses/dpo": 0.6910836100578308, "losses/sft": 0.7998620271682739, "losses/total": 0.6910836100578308, "ref_logps/chosen": -21.460729598999023, "ref_logps/rejected": -27.336944580078125, "rewards/accuracies": 0.515625, "rewards/chosen": 0.0015797324012964964, "rewards/margins": 0.0008232423570007086, "rewards/rejected": 0.0007564900442957878, "step": 35 }, { "epoch": 0.27, "learning_rate": 4.5e-07, "logps/chosen": -22.847640991210938, "logps/rejected": -26.22686195373535, "loss": 0.6938, "losses/dpo": 0.6915764808654785, "losses/sft": 0.7927474975585938, "losses/total": 0.6915764808654785, "ref_logps/chosen": -22.84987449645996, "ref_logps/rejected": -26.240699768066406, "rewards/accuracies": 0.4375, "rewards/chosen": 0.00022311191423796117, "rewards/margins": -0.001160716055892408, "rewards/rejected": 0.0013838279992341995, "step": 36 }, { "epoch": 0.28, "learning_rate": 4.625e-07, "logps/chosen": -23.097599029541016, "logps/rejected": -25.179964065551758, "loss": 0.6925, "losses/dpo": 0.6903287768363953, "losses/sft": 0.8005999326705933, "losses/total": 0.6903287768363953, "ref_logps/chosen": -23.103515625, "ref_logps/rejected": -25.171833038330078, "rewards/accuracies": 0.4921875, "rewards/chosen": 0.000591703865211457, "rewards/margins": 0.001405149232596159, "rewards/rejected": -0.000813445309177041, "step": 37 }, { "epoch": 0.29, "learning_rate": 4.7499999999999995e-07, "logps/chosen": -23.07529640197754, "logps/rejected": -26.14615821838379, "loss": 0.6912, "losses/dpo": 0.6978300213813782, "losses/sft": 0.7380209565162659, "losses/total": 0.6978300213813782, "ref_logps/chosen": -23.113910675048828, "ref_logps/rejected": -26.144914627075195, "rewards/accuracies": 0.578125, "rewards/chosen": 0.003861566074192524, "rewards/margins": 0.003985891118645668, "rewards/rejected": -0.00012432527728378773, "step": 38 }, { "epoch": 0.29, "learning_rate": 4.875e-07, "logps/chosen": -23.091575622558594, "logps/rejected": -28.207073211669922, "loss": 0.6933, "losses/dpo": 0.6926023960113525, "losses/sft": 0.7966833710670471, "losses/total": 0.6926023960113525, "ref_logps/chosen": -23.102182388305664, "ref_logps/rejected": -28.21949577331543, "rewards/accuracies": 0.5234375, "rewards/chosen": 0.0010607184376567602, "rewards/margins": -0.00018126872600987554, "rewards/rejected": 0.0012419875711202621, "step": 39 }, { "epoch": 0.3, "learning_rate": 5e-07, "logps/chosen": -21.683151245117188, "logps/rejected": -27.111900329589844, "loss": 0.6903, "losses/dpo": 0.6866278648376465, "losses/sft": 0.887488842010498, "losses/total": 0.6866278648376465, "ref_logps/chosen": -21.714126586914062, "ref_logps/rejected": -27.08427619934082, "rewards/accuracies": 0.609375, "rewards/chosen": 0.003097555134445429, "rewards/margins": 0.0058600143529474735, "rewards/rejected": -0.0027624592185020447, "step": 40 }, { "epoch": 0.31, "learning_rate": 4.985955056179775e-07, "logps/chosen": -23.24443817138672, "logps/rejected": -24.057823181152344, "loss": 0.6924, "losses/dpo": 0.6903232336044312, "losses/sft": 0.7454457879066467, "losses/total": 0.6903232336044312, "ref_logps/chosen": -23.264373779296875, "ref_logps/rejected": -24.060710906982422, "rewards/accuracies": 0.4765625, "rewards/chosen": 0.0019934140145778656, "rewards/margins": 0.0017045673448592424, "rewards/rejected": 0.0002888469025492668, "step": 41 }, { "epoch": 0.32, "learning_rate": 4.97191011235955e-07, "logps/chosen": -22.751291275024414, "logps/rejected": -23.993690490722656, "loss": 0.692, "losses/dpo": 0.6917561292648315, "losses/sft": 0.8527467846870422, "losses/total": 0.6917561292648315, "ref_logps/chosen": -22.75712013244629, "ref_logps/rejected": -23.975711822509766, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0005830166628584266, "rewards/margins": 0.0023808996193110943, "rewards/rejected": -0.0017978833056986332, "step": 42 }, { "epoch": 0.32, "learning_rate": 4.957865168539325e-07, "logps/chosen": -24.575613021850586, "logps/rejected": -27.22784996032715, "loss": 0.6922, "losses/dpo": 0.6912024021148682, "losses/sft": 0.8869270086288452, "losses/total": 0.6912024021148682, "ref_logps/chosen": -24.60643196105957, "ref_logps/rejected": -27.23748779296875, "rewards/accuracies": 0.5234375, "rewards/chosen": 0.0030817545484751463, "rewards/margins": 0.002117899712175131, "rewards/rejected": 0.0009638546616770327, "step": 43 }, { "epoch": 0.33, "learning_rate": 4.943820224719101e-07, "logps/chosen": -23.449739456176758, "logps/rejected": -29.683177947998047, "loss": 0.6913, "losses/dpo": 0.690817654132843, "losses/sft": 0.7518939971923828, "losses/total": 0.690817654132843, "ref_logps/chosen": -23.47886848449707, "ref_logps/rejected": -29.673599243164062, "rewards/accuracies": 0.5546875, "rewards/chosen": 0.0029130401089787483, "rewards/margins": 0.0038708222564309835, "rewards/rejected": -0.0009577819146215916, "step": 44 }, { "epoch": 0.34, "learning_rate": 4.929775280898877e-07, "logps/chosen": -21.53199577331543, "logps/rejected": -26.939178466796875, "loss": 0.6923, "losses/dpo": 0.6913425922393799, "losses/sft": 0.6940815448760986, "losses/total": 0.6913425922393799, "ref_logps/chosen": -21.567256927490234, "ref_logps/rejected": -26.955793380737305, "rewards/accuracies": 0.5078125, "rewards/chosen": 0.0035258703865110874, "rewards/margins": 0.0018642698414623737, "rewards/rejected": 0.0016616008942946792, "step": 45 }, { "epoch": 0.35, "learning_rate": 4.915730337078651e-07, "logps/chosen": -25.476314544677734, "logps/rejected": -28.62994956970215, "loss": 0.6903, "losses/dpo": 0.6909126043319702, "losses/sft": 0.9766503572463989, "losses/total": 0.6909126043319702, "ref_logps/chosen": -25.53481674194336, "ref_logps/rejected": -28.629175186157227, "rewards/accuracies": 0.5703125, "rewards/chosen": 0.005850302986800671, "rewards/margins": 0.005927846767008305, "rewards/rejected": -7.754407124593854e-05, "step": 46 }, { "epoch": 0.35, "learning_rate": 4.901685393258427e-07, "logps/chosen": -24.225303649902344, "logps/rejected": -25.871768951416016, "loss": 0.6919, "losses/dpo": 0.6921358704566956, "losses/sft": 0.8468361496925354, "losses/total": 0.6921358704566956, "ref_logps/chosen": -24.27114486694336, "ref_logps/rejected": -25.892141342163086, "rewards/accuracies": 0.5625, "rewards/chosen": 0.004584114067256451, "rewards/margins": 0.0025467565283179283, "rewards/rejected": 0.0020373575389385223, "step": 47 }, { "epoch": 0.36, "learning_rate": 4.887640449438202e-07, "logps/chosen": -21.586902618408203, "logps/rejected": -27.604530334472656, "loss": 0.6902, "losses/dpo": 0.6886686086654663, "losses/sft": 0.7169030904769897, "losses/total": 0.6886686086654663, "ref_logps/chosen": -21.621915817260742, "ref_logps/rejected": -27.578147888183594, "rewards/accuracies": 0.5703125, "rewards/chosen": 0.003501205239444971, "rewards/margins": 0.006139571778476238, "rewards/rejected": -0.00263836607336998, "step": 48 }, { "epoch": 0.37, "learning_rate": 4.873595505617978e-07, "logps/chosen": -24.600910186767578, "logps/rejected": -30.10862922668457, "loss": 0.6909, "losses/dpo": 0.6952996850013733, "losses/sft": 0.7813842296600342, "losses/total": 0.6952996850013733, "ref_logps/chosen": -24.644847869873047, "ref_logps/rejected": -30.104995727539062, "rewards/accuracies": 0.5859375, "rewards/chosen": 0.004394051153212786, "rewards/margins": 0.004757395945489407, "rewards/rejected": -0.0003633448213804513, "step": 49 }, { "epoch": 0.38, "learning_rate": 4.859550561797752e-07, "logps/chosen": -20.754308700561523, "logps/rejected": -24.876815795898438, "loss": 0.6914, "losses/dpo": 0.6889626979827881, "losses/sft": 0.8148602843284607, "losses/total": 0.6889626979827881, "ref_logps/chosen": -20.81591796875, "ref_logps/rejected": -24.90121078491211, "rewards/accuracies": 0.5, "rewards/chosen": 0.006161023862659931, "rewards/margins": 0.0037216043565422297, "rewards/rejected": 0.0024394195061177015, "step": 50 }, { "epoch": 0.38, "learning_rate": 4.845505617977528e-07, "logps/chosen": -23.585115432739258, "logps/rejected": -24.949783325195312, "loss": 0.6915, "losses/dpo": 0.6886854767799377, "losses/sft": 0.8582803010940552, "losses/total": 0.6886854767799377, "ref_logps/chosen": -23.63630485534668, "ref_logps/rejected": -24.965686798095703, "rewards/accuracies": 0.5546875, "rewards/chosen": 0.00511885154992342, "rewards/margins": 0.00352850160561502, "rewards/rejected": 0.0015903504099696875, "step": 51 }, { "epoch": 0.39, "learning_rate": 4.831460674157303e-07, "logps/chosen": -20.576318740844727, "logps/rejected": -24.87842559814453, "loss": 0.6916, "losses/dpo": 0.6899633407592773, "losses/sft": 0.6870510578155518, "losses/total": 0.6899633407592773, "ref_logps/chosen": -20.60286521911621, "ref_logps/rejected": -24.87299346923828, "rewards/accuracies": 0.5546875, "rewards/chosen": 0.0026545142754912376, "rewards/margins": 0.0031977419275790453, "rewards/rejected": -0.0005432275356724858, "step": 52 }, { "epoch": 0.4, "learning_rate": 4.817415730337078e-07, "logps/chosen": -24.051544189453125, "logps/rejected": -25.128353118896484, "loss": 0.6887, "losses/dpo": 0.6841185092926025, "losses/sft": 0.833280622959137, "losses/total": 0.6841185092926025, "ref_logps/chosen": -24.10638427734375, "ref_logps/rejected": -25.090627670288086, "rewards/accuracies": 0.6171875, "rewards/chosen": 0.005483907647430897, "rewards/margins": 0.009256447665393353, "rewards/rejected": -0.003772540483623743, "step": 53 }, { "epoch": 0.41, "learning_rate": 4.803370786516854e-07, "logps/chosen": -21.564958572387695, "logps/rejected": -26.20134735107422, "loss": 0.6914, "losses/dpo": 0.687272846698761, "losses/sft": 0.7218018770217896, "losses/total": 0.687272846698761, "ref_logps/chosen": -21.621246337890625, "ref_logps/rejected": -26.221445083618164, "rewards/accuracies": 0.546875, "rewards/chosen": 0.00562882237136364, "rewards/margins": 0.003618879709392786, "rewards/rejected": 0.002009942661970854, "step": 54 }, { "epoch": 0.42, "learning_rate": 4.789325842696629e-07, "logps/chosen": -23.699432373046875, "logps/rejected": -26.1567325592041, "loss": 0.6884, "losses/dpo": 0.6862033605575562, "losses/sft": 0.9426325559616089, "losses/total": 0.6862033605575562, "ref_logps/chosen": -23.775989532470703, "ref_logps/rejected": -26.134971618652344, "rewards/accuracies": 0.5859375, "rewards/chosen": 0.007655493449419737, "rewards/margins": 0.009831697680056095, "rewards/rejected": -0.0021762042306363583, "step": 55 }, { "epoch": 0.42, "learning_rate": 4.775280898876405e-07, "logps/chosen": -23.076374053955078, "logps/rejected": -27.695213317871094, "loss": 0.6881, "losses/dpo": 0.6900283098220825, "losses/sft": 0.8505688905715942, "losses/total": 0.6900283098220825, "ref_logps/chosen": -23.134254455566406, "ref_logps/rejected": -27.64853858947754, "rewards/accuracies": 0.6640625, "rewards/chosen": 0.005787987262010574, "rewards/margins": 0.010455346666276455, "rewards/rejected": -0.004667359404265881, "step": 56 }, { "epoch": 0.43, "learning_rate": 4.7612359550561797e-07, "logps/chosen": -21.54006576538086, "logps/rejected": -24.36727523803711, "loss": 0.6911, "losses/dpo": 0.6942879557609558, "losses/sft": 0.7311047911643982, "losses/total": 0.6942879557609558, "ref_logps/chosen": -21.592029571533203, "ref_logps/rejected": -24.37733268737793, "rewards/accuracies": 0.6015625, "rewards/chosen": 0.0051962630823254585, "rewards/margins": 0.004190489184111357, "rewards/rejected": 0.0010057740146294236, "step": 57 }, { "epoch": 0.44, "learning_rate": 4.747191011235955e-07, "logps/chosen": -21.678865432739258, "logps/rejected": -28.501548767089844, "loss": 0.6906, "losses/dpo": 0.6889323592185974, "losses/sft": 0.7590615749359131, "losses/total": 0.6889323592185974, "ref_logps/chosen": -21.72535514831543, "ref_logps/rejected": -28.493976593017578, "rewards/accuracies": 0.6328125, "rewards/chosen": 0.004648969508707523, "rewards/margins": 0.005405961070209742, "rewards/rejected": -0.0007569912704639137, "step": 58 }, { "epoch": 0.45, "learning_rate": 4.7331460674157303e-07, "logps/chosen": -23.9781436920166, "logps/rejected": -26.515047073364258, "loss": 0.684, "losses/dpo": 0.6820257902145386, "losses/sft": 0.8394409418106079, "losses/total": 0.6820257902145386, "ref_logps/chosen": -24.077434539794922, "ref_logps/rejected": -26.427589416503906, "rewards/accuracies": 0.6875, "rewards/chosen": 0.009928906336426735, "rewards/margins": 0.01867445930838585, "rewards/rejected": -0.008745552971959114, "step": 59 }, { "epoch": 0.45, "learning_rate": 4.7191011235955054e-07, "logps/chosen": -22.162433624267578, "logps/rejected": -30.391559600830078, "loss": 0.6894, "losses/dpo": 0.6909818053245544, "losses/sft": 0.7433596253395081, "losses/total": 0.6909818053245544, "ref_logps/chosen": -22.250009536743164, "ref_logps/rejected": -30.40111541748047, "rewards/accuracies": 0.5625, "rewards/chosen": 0.00875765923410654, "rewards/margins": 0.007802051026374102, "rewards/rejected": 0.0009556080331094563, "step": 60 }, { "epoch": 0.46, "learning_rate": 4.705056179775281e-07, "logps/chosen": -24.088329315185547, "logps/rejected": -26.851608276367188, "loss": 0.6888, "losses/dpo": 0.6858267188072205, "losses/sft": 0.6961312294006348, "losses/total": 0.6858267188072205, "ref_logps/chosen": -24.163042068481445, "ref_logps/rejected": -26.837688446044922, "rewards/accuracies": 0.625, "rewards/chosen": 0.007471038028597832, "rewards/margins": 0.008863050490617752, "rewards/rejected": -0.0013920125784352422, "step": 61 }, { "epoch": 0.47, "learning_rate": 4.691011235955056e-07, "logps/chosen": -23.13729476928711, "logps/rejected": -28.607454299926758, "loss": 0.6896, "losses/dpo": 0.6952353715896606, "losses/sft": 0.8425909280776978, "losses/total": 0.6952353715896606, "ref_logps/chosen": -23.206546783447266, "ref_logps/rejected": -28.603229522705078, "rewards/accuracies": 0.578125, "rewards/chosen": 0.006925276480615139, "rewards/margins": 0.007347787730395794, "rewards/rejected": -0.00042251107515767217, "step": 62 }, { "epoch": 0.48, "learning_rate": 4.6769662921348315e-07, "logps/chosen": -22.758800506591797, "logps/rejected": -25.503629684448242, "loss": 0.6882, "losses/dpo": 0.690306544303894, "losses/sft": 0.7292711734771729, "losses/total": 0.690306544303894, "ref_logps/chosen": -22.867115020751953, "ref_logps/rejected": -25.50885009765625, "rewards/accuracies": 0.640625, "rewards/chosen": 0.010831332765519619, "rewards/margins": 0.010309312492609024, "rewards/rejected": 0.0005220210296101868, "step": 63 }, { "epoch": 0.48, "learning_rate": 4.662921348314606e-07, "logps/chosen": -22.957290649414062, "logps/rejected": -27.15595245361328, "loss": 0.6868, "losses/dpo": 0.6876275539398193, "losses/sft": 0.9537997245788574, "losses/total": 0.6876275539398193, "ref_logps/chosen": -23.08481788635254, "ref_logps/rejected": -27.15395736694336, "rewards/accuracies": 0.6484375, "rewards/chosen": 0.012752560898661613, "rewards/margins": 0.012952261604368687, "rewards/rejected": -0.0001996997743844986, "step": 64 }, { "epoch": 0.49, "learning_rate": 4.6488764044943816e-07, "logps/chosen": -21.856212615966797, "logps/rejected": -28.90016746520996, "loss": 0.688, "losses/dpo": 0.6866365075111389, "losses/sft": 0.748786211013794, "losses/total": 0.6866365075111389, "ref_logps/chosen": -21.946701049804688, "ref_logps/rejected": -28.884090423583984, "rewards/accuracies": 0.5546875, "rewards/chosen": 0.009048780426383018, "rewards/margins": 0.010656429454684258, "rewards/rejected": -0.0016076482133939862, "step": 65 }, { "epoch": 0.5, "learning_rate": 4.634831460674157e-07, "logps/chosen": -21.727970123291016, "logps/rejected": -24.484195709228516, "loss": 0.6866, "losses/dpo": 0.6858303546905518, "losses/sft": 0.7428255677223206, "losses/total": 0.6858303546905518, "ref_logps/chosen": -21.827533721923828, "ref_logps/rejected": -24.44991111755371, "rewards/accuracies": 0.6484375, "rewards/chosen": 0.009956244379281998, "rewards/margins": 0.013384684920310974, "rewards/rejected": -0.0034284412395209074, "step": 66 }, { "epoch": 0.51, "learning_rate": 4.620786516853932e-07, "logps/chosen": -23.145030975341797, "logps/rejected": -25.03292465209961, "loss": 0.685, "losses/dpo": 0.6789939403533936, "losses/sft": 0.718001127243042, "losses/total": 0.6789939403533936, "ref_logps/chosen": -23.27937889099121, "ref_logps/rejected": -24.999483108520508, "rewards/accuracies": 0.671875, "rewards/chosen": 0.013434557244181633, "rewards/margins": 0.016778716817498207, "rewards/rejected": -0.0033441600389778614, "step": 67 }, { "epoch": 0.51, "learning_rate": 4.606741573033708e-07, "logps/chosen": -21.208370208740234, "logps/rejected": -25.74646759033203, "loss": 0.6852, "losses/dpo": 0.6921157836914062, "losses/sft": 0.8621765971183777, "losses/total": 0.6921157836914062, "ref_logps/chosen": -21.325489044189453, "ref_logps/rejected": -25.700342178344727, "rewards/accuracies": 0.640625, "rewards/chosen": 0.011711984872817993, "rewards/margins": 0.016324326395988464, "rewards/rejected": -0.0046123419888317585, "step": 68 }, { "epoch": 0.52, "learning_rate": 4.592696629213483e-07, "logps/chosen": -22.621421813964844, "logps/rejected": -28.81465721130371, "loss": 0.6885, "losses/dpo": 0.689292848110199, "losses/sft": 0.7215853929519653, "losses/total": 0.689292848110199, "ref_logps/chosen": -22.720117568969727, "ref_logps/rejected": -28.81524658203125, "rewards/accuracies": 0.609375, "rewards/chosen": 0.009869576431810856, "rewards/margins": 0.009810445830225945, "rewards/rejected": 5.913013592362404e-05, "step": 69 }, { "epoch": 0.53, "learning_rate": 4.5786516853932584e-07, "logps/chosen": -22.636703491210938, "logps/rejected": -28.595046997070312, "loss": 0.6872, "losses/dpo": 0.6876038312911987, "losses/sft": 0.7616434097290039, "losses/total": 0.6876038312911987, "ref_logps/chosen": -22.73769187927246, "ref_logps/rejected": -28.57284164428711, "rewards/accuracies": 0.6328125, "rewards/chosen": 0.01009867899119854, "rewards/margins": 0.01231930311769247, "rewards/rejected": -0.002220625290647149, "step": 70 }, { "epoch": 0.54, "learning_rate": 4.5646067415730334e-07, "logps/chosen": -23.055517196655273, "logps/rejected": -28.524490356445312, "loss": 0.6849, "losses/dpo": 0.6818934082984924, "losses/sft": 0.8828948736190796, "losses/total": 0.6818934082984924, "ref_logps/chosen": -23.18179702758789, "ref_logps/rejected": -28.479928970336914, "rewards/accuracies": 0.6796875, "rewards/chosen": 0.012628016993403435, "rewards/margins": 0.01708414778113365, "rewards/rejected": -0.004456131719052792, "step": 71 }, { "epoch": 0.54, "learning_rate": 4.550561797752809e-07, "logps/chosen": -25.802350997924805, "logps/rejected": -29.403223037719727, "loss": 0.682, "losses/dpo": 0.6922101974487305, "losses/sft": 0.7417640089988708, "losses/total": 0.6922101974487305, "ref_logps/chosen": -25.971485137939453, "ref_logps/rejected": -29.342666625976562, "rewards/accuracies": 0.6875, "rewards/chosen": 0.016913428902626038, "rewards/margins": 0.02296869084239006, "rewards/rejected": -0.006055259145796299, "step": 72 }, { "epoch": 0.55, "learning_rate": 4.536516853932584e-07, "logps/chosen": -22.979541778564453, "logps/rejected": -31.861392974853516, "loss": 0.6849, "losses/dpo": 0.6843876242637634, "losses/sft": 0.6335030198097229, "losses/total": 0.6843876242637634, "ref_logps/chosen": -23.086105346679688, "ref_logps/rejected": -31.796035766601562, "rewards/accuracies": 0.609375, "rewards/chosen": 0.010656386613845825, "rewards/margins": 0.017192194238305092, "rewards/rejected": -0.006535808090120554, "step": 73 }, { "epoch": 0.56, "learning_rate": 4.522471910112359e-07, "logps/chosen": -21.333240509033203, "logps/rejected": -25.32451629638672, "loss": 0.6842, "losses/dpo": 0.6832489967346191, "losses/sft": 0.8737274408340454, "losses/total": 0.6832489967346191, "ref_logps/chosen": -21.456268310546875, "ref_logps/rejected": -25.263538360595703, "rewards/accuracies": 0.703125, "rewards/chosen": 0.012302841059863567, "rewards/margins": 0.01840106211602688, "rewards/rejected": -0.006098220124840736, "step": 74 }, { "epoch": 0.57, "learning_rate": 4.5084269662921347e-07, "logps/chosen": -21.905548095703125, "logps/rejected": -25.504837036132812, "loss": 0.6845, "losses/dpo": 0.6803750991821289, "losses/sft": 0.7227590084075928, "losses/total": 0.6803750991821289, "ref_logps/chosen": -22.001012802124023, "ref_logps/rejected": -25.422731399536133, "rewards/accuracies": 0.6953125, "rewards/chosen": 0.009546317160129547, "rewards/margins": 0.01775689423084259, "rewards/rejected": -0.008210576139390469, "step": 75 }, { "epoch": 0.57, "learning_rate": 4.4943820224719097e-07, "logps/chosen": -22.212453842163086, "logps/rejected": -25.56966209411621, "loss": 0.6845, "losses/dpo": 0.6879241466522217, "losses/sft": 0.936349093914032, "losses/total": 0.6879241466522217, "ref_logps/chosen": -22.337223052978516, "ref_logps/rejected": -25.51331901550293, "rewards/accuracies": 0.625, "rewards/chosen": 0.012477071955800056, "rewards/margins": 0.018111376091837883, "rewards/rejected": -0.005634305067360401, "step": 76 }, { "epoch": 0.58, "learning_rate": 4.4803370786516853e-07, "logps/chosen": -20.199138641357422, "logps/rejected": -26.30996322631836, "loss": 0.6818, "losses/dpo": 0.6872521638870239, "losses/sft": 0.6872013211250305, "losses/total": 0.6872521638870239, "ref_logps/chosen": -20.368690490722656, "ref_logps/rejected": -26.24540138244629, "rewards/accuracies": 0.6875, "rewards/chosen": 0.016955075785517693, "rewards/margins": 0.02341129444539547, "rewards/rejected": -0.006456219125539064, "step": 77 }, { "epoch": 0.59, "learning_rate": 4.4662921348314603e-07, "logps/chosen": -22.031774520874023, "logps/rejected": -26.07961082458496, "loss": 0.6813, "losses/dpo": 0.6833238005638123, "losses/sft": 0.7775546312332153, "losses/total": 0.6833238005638123, "ref_logps/chosen": -22.163204193115234, "ref_logps/rejected": -25.965608596801758, "rewards/accuracies": 0.7109375, "rewards/chosen": 0.01314287818968296, "rewards/margins": 0.02454320341348648, "rewards/rejected": -0.011400324292480946, "step": 78 }, { "epoch": 0.6, "learning_rate": 4.452247191011236e-07, "logps/chosen": -22.522083282470703, "logps/rejected": -26.621906280517578, "loss": 0.6801, "losses/dpo": 0.6835525035858154, "losses/sft": 0.7558909058570862, "losses/total": 0.6835525035858154, "ref_logps/chosen": -22.656497955322266, "ref_logps/rejected": -26.48514747619629, "rewards/accuracies": 0.6875, "rewards/chosen": 0.013441269285976887, "rewards/margins": 0.027117114514112473, "rewards/rejected": -0.013675847090780735, "step": 79 }, { "epoch": 0.6, "learning_rate": 4.438202247191011e-07, "logps/chosen": -22.05775260925293, "logps/rejected": -26.428781509399414, "loss": 0.6836, "losses/dpo": 0.6767468452453613, "losses/sft": 0.8101401329040527, "losses/total": 0.6767468452453613, "ref_logps/chosen": -22.211511611938477, "ref_logps/rejected": -26.38385772705078, "rewards/accuracies": 0.59375, "rewards/chosen": 0.015375516377389431, "rewards/margins": 0.019868001341819763, "rewards/rejected": -0.004492484033107758, "step": 80 }, { "epoch": 0.61, "learning_rate": 4.4241573033707865e-07, "logps/chosen": -22.327136993408203, "logps/rejected": -27.90719985961914, "loss": 0.6803, "losses/dpo": 0.6830211281776428, "losses/sft": 0.7352213263511658, "losses/total": 0.6830211281776428, "ref_logps/chosen": -22.457595825195312, "ref_logps/rejected": -27.77078628540039, "rewards/accuracies": 0.6484375, "rewards/chosen": 0.013045946136116982, "rewards/margins": 0.026687312871217728, "rewards/rejected": -0.013641366735100746, "step": 81 }, { "epoch": 0.62, "learning_rate": 4.410112359550562e-07, "logps/chosen": -23.738140106201172, "logps/rejected": -26.4810791015625, "loss": 0.6802, "losses/dpo": 0.673937976360321, "losses/sft": 0.7962872385978699, "losses/total": 0.673937976360321, "ref_logps/chosen": -23.89459991455078, "ref_logps/rejected": -26.366804122924805, "rewards/accuracies": 0.65625, "rewards/chosen": 0.015645721927285194, "rewards/margins": 0.027073292061686516, "rewards/rejected": -0.011427570134401321, "step": 82 }, { "epoch": 0.63, "learning_rate": 4.3960674157303366e-07, "logps/chosen": -21.008014678955078, "logps/rejected": -24.34069061279297, "loss": 0.6805, "losses/dpo": 0.6789628863334656, "losses/sft": 0.9124815464019775, "losses/total": 0.6789628863334656, "ref_logps/chosen": -21.115734100341797, "ref_logps/rejected": -24.184371948242188, "rewards/accuracies": 0.65625, "rewards/chosen": 0.01077171228826046, "rewards/margins": 0.026403725147247314, "rewards/rejected": -0.015632012858986855, "step": 83 }, { "epoch": 0.63, "learning_rate": 4.382022471910112e-07, "logps/chosen": -20.62143325805664, "logps/rejected": -26.963245391845703, "loss": 0.6833, "losses/dpo": 0.6907744407653809, "losses/sft": 0.7639827728271484, "losses/total": 0.6907744407653809, "ref_logps/chosen": -20.739776611328125, "ref_logps/rejected": -26.87374496459961, "rewards/accuracies": 0.625, "rewards/chosen": 0.011834252625703812, "rewards/margins": 0.020784219726920128, "rewards/rejected": -0.008949968963861465, "step": 84 }, { "epoch": 0.64, "learning_rate": 4.367977528089887e-07, "logps/chosen": -21.591964721679688, "logps/rejected": -24.5494384765625, "loss": 0.6846, "losses/dpo": 0.6878204345703125, "losses/sft": 0.6917088627815247, "losses/total": 0.6878204345703125, "ref_logps/chosen": -21.644916534423828, "ref_logps/rejected": -24.421239852905273, "rewards/accuracies": 0.6328125, "rewards/chosen": 0.005295174196362495, "rewards/margins": 0.01811503805220127, "rewards/rejected": -0.012819863855838776, "step": 85 }, { "epoch": 0.65, "learning_rate": 4.353932584269663e-07, "logps/chosen": -24.759811401367188, "logps/rejected": -28.227123260498047, "loss": 0.6825, "losses/dpo": 0.6937445402145386, "losses/sft": 0.9424384832382202, "losses/total": 0.6937445402145386, "ref_logps/chosen": -24.891460418701172, "ref_logps/rejected": -28.136310577392578, "rewards/accuracies": 0.6484375, "rewards/chosen": 0.013164759613573551, "rewards/margins": 0.022245781496167183, "rewards/rejected": -0.009081022813916206, "step": 86 }, { "epoch": 0.66, "learning_rate": 4.339887640449438e-07, "logps/chosen": -22.8006591796875, "logps/rejected": -26.10009002685547, "loss": 0.6795, "losses/dpo": 0.6909404993057251, "losses/sft": 0.8603497743606567, "losses/total": 0.6909404993057251, "ref_logps/chosen": -22.96673583984375, "ref_logps/rejected": -25.977882385253906, "rewards/accuracies": 0.7109375, "rewards/chosen": 0.01660749316215515, "rewards/margins": 0.028828214854002, "rewards/rejected": -0.012220719829201698, "step": 87 }, { "epoch": 0.66, "learning_rate": 4.3258426966292134e-07, "logps/chosen": -24.15732765197754, "logps/rejected": -28.13039779663086, "loss": 0.6752, "losses/dpo": 0.6638558506965637, "losses/sft": 0.8455443382263184, "losses/total": 0.6638558506965637, "ref_logps/chosen": -24.341087341308594, "ref_logps/rejected": -27.938106536865234, "rewards/accuracies": 0.65625, "rewards/chosen": 0.018375899642705917, "rewards/margins": 0.03760489821434021, "rewards/rejected": -0.019228998571634293, "step": 88 }, { "epoch": 0.67, "learning_rate": 4.311797752808989e-07, "logps/chosen": -21.290430068969727, "logps/rejected": -25.207626342773438, "loss": 0.6771, "losses/dpo": 0.6774411797523499, "losses/sft": 0.9257520437240601, "losses/total": 0.6774411797523499, "ref_logps/chosen": -21.449438095092773, "ref_logps/rejected": -25.032873153686523, "rewards/accuracies": 0.65625, "rewards/chosen": 0.015900880098342896, "rewards/margins": 0.033375710248947144, "rewards/rejected": -0.017474830150604248, "step": 89 }, { "epoch": 0.68, "learning_rate": 4.297752808988764e-07, "logps/chosen": -24.241390228271484, "logps/rejected": -27.57483673095703, "loss": 0.681, "losses/dpo": 0.6869298219680786, "losses/sft": 0.8004887104034424, "losses/total": 0.6869298219680786, "ref_logps/chosen": -24.399887084960938, "ref_logps/rejected": -27.475460052490234, "rewards/accuracies": 0.640625, "rewards/chosen": 0.015849877148866653, "rewards/margins": 0.02578754723072052, "rewards/rejected": -0.009937671013176441, "step": 90 }, { "epoch": 0.69, "learning_rate": 4.2837078651685396e-07, "logps/chosen": -21.290605545043945, "logps/rejected": -25.188884735107422, "loss": 0.6843, "losses/dpo": 0.6896719336509705, "losses/sft": 0.7865870594978333, "losses/total": 0.6896719336509705, "ref_logps/chosen": -21.39483642578125, "ref_logps/rejected": -25.098407745361328, "rewards/accuracies": 0.6328125, "rewards/chosen": 0.010422902181744576, "rewards/margins": 0.01947084441781044, "rewards/rejected": -0.00904794316738844, "step": 91 }, { "epoch": 0.69, "learning_rate": 4.269662921348314e-07, "logps/chosen": -21.395389556884766, "logps/rejected": -24.005056381225586, "loss": 0.6863, "losses/dpo": 0.6820717453956604, "losses/sft": 0.8161361813545227, "losses/total": 0.6820717453956604, "ref_logps/chosen": -21.495037078857422, "ref_logps/rejected": -23.9505558013916, "rewards/accuracies": 0.53125, "rewards/chosen": 0.009964808821678162, "rewards/margins": 0.015414956025779247, "rewards/rejected": -0.00545014813542366, "step": 92 }, { "epoch": 0.7, "learning_rate": 4.2556179775280896e-07, "logps/chosen": -20.948806762695312, "logps/rejected": -24.735366821289062, "loss": 0.6786, "losses/dpo": 0.6868577599525452, "losses/sft": 0.7177249193191528, "losses/total": 0.6868577599525452, "ref_logps/chosen": -21.072193145751953, "ref_logps/rejected": -24.555286407470703, "rewards/accuracies": 0.6953125, "rewards/chosen": 0.012338603846728802, "rewards/margins": 0.030346699059009552, "rewards/rejected": -0.018008096143603325, "step": 93 }, { "epoch": 0.71, "learning_rate": 4.2415730337078647e-07, "logps/chosen": -24.245830535888672, "logps/rejected": -28.811023712158203, "loss": 0.6783, "losses/dpo": 0.6721839904785156, "losses/sft": 0.816402018070221, "losses/total": 0.6721839904785156, "ref_logps/chosen": -24.40906524658203, "ref_logps/rejected": -28.65966033935547, "rewards/accuracies": 0.703125, "rewards/chosen": 0.01632346771657467, "rewards/margins": 0.03146028146147728, "rewards/rejected": -0.01513681747019291, "step": 94 }, { "epoch": 0.72, "learning_rate": 4.22752808988764e-07, "logps/chosen": -22.48372459411621, "logps/rejected": -29.088359832763672, "loss": 0.6709, "losses/dpo": 0.6718644499778748, "losses/sft": 0.823063313961029, "losses/total": 0.6718644499778748, "ref_logps/chosen": -22.634136199951172, "ref_logps/rejected": -28.77379608154297, "rewards/accuracies": 0.7109375, "rewards/chosen": 0.01504128985106945, "rewards/margins": 0.04649777710437775, "rewards/rejected": -0.03145648539066315, "step": 95 }, { "epoch": 0.72, "learning_rate": 4.2134831460674153e-07, "logps/chosen": -20.869436264038086, "logps/rejected": -27.790451049804688, "loss": 0.6785, "losses/dpo": 0.6842025518417358, "losses/sft": 0.8330531120300293, "losses/total": 0.6842025518417358, "ref_logps/chosen": -20.964067459106445, "ref_logps/rejected": -27.572711944580078, "rewards/accuracies": 0.6484375, "rewards/chosen": 0.009463240392506123, "rewards/margins": 0.031237438321113586, "rewards/rejected": -0.02177419885993004, "step": 96 }, { "epoch": 0.73, "learning_rate": 4.199438202247191e-07, "logps/chosen": -22.02164077758789, "logps/rejected": -28.644880294799805, "loss": 0.6772, "losses/dpo": 0.6933009028434753, "losses/sft": 0.7342395186424255, "losses/total": 0.6933009028434753, "ref_logps/chosen": -22.146129608154297, "ref_logps/rejected": -28.42925262451172, "rewards/accuracies": 0.640625, "rewards/chosen": 0.012448801659047604, "rewards/margins": 0.03401148319244385, "rewards/rejected": -0.02156267873942852, "step": 97 }, { "epoch": 0.74, "learning_rate": 4.1853932584269664e-07, "logps/chosen": -21.086360931396484, "logps/rejected": -23.74181365966797, "loss": 0.6834, "losses/dpo": 0.7061095833778381, "losses/sft": 0.6976662278175354, "losses/total": 0.7061095833778381, "ref_logps/chosen": -21.240116119384766, "ref_logps/rejected": -23.683523178100586, "rewards/accuracies": 0.5625, "rewards/chosen": 0.015375564806163311, "rewards/margins": 0.021204624325037003, "rewards/rejected": -0.0058290609158575535, "step": 98 }, { "epoch": 0.75, "learning_rate": 4.1713483146067415e-07, "logps/chosen": -21.535640716552734, "logps/rejected": -28.555763244628906, "loss": 0.6749, "losses/dpo": 0.6546899080276489, "losses/sft": 0.8132616281509399, "losses/total": 0.6546899080276489, "ref_logps/chosen": -21.68370819091797, "ref_logps/rejected": -28.313589096069336, "rewards/accuracies": 0.6640625, "rewards/chosen": 0.014806646853685379, "rewards/margins": 0.03902393952012062, "rewards/rejected": -0.02421729266643524, "step": 99 }, { "epoch": 0.75, "learning_rate": 4.157303370786517e-07, "logps/chosen": -22.314010620117188, "logps/rejected": -26.403512954711914, "loss": 0.6777, "losses/dpo": 0.6830233931541443, "losses/sft": 0.7298552393913269, "losses/total": 0.6830233931541443, "ref_logps/chosen": -22.442527770996094, "ref_logps/rejected": -26.1983699798584, "rewards/accuracies": 0.625, "rewards/chosen": 0.012851729989051819, "rewards/margins": 0.033365827053785324, "rewards/rejected": -0.020514097064733505, "step": 100 }, { "epoch": 0.76, "learning_rate": 4.1432584269662915e-07, "logps/chosen": -23.65606117248535, "logps/rejected": -27.6639461517334, "loss": 0.6787, "losses/dpo": 0.66861492395401, "losses/sft": 0.7538549900054932, "losses/total": 0.66861492395401, "ref_logps/chosen": -23.742881774902344, "ref_logps/rejected": -27.43739128112793, "rewards/accuracies": 0.6328125, "rewards/chosen": 0.008682135492563248, "rewards/margins": 0.03133738413453102, "rewards/rejected": -0.022655250504612923, "step": 101 }, { "epoch": 0.77, "learning_rate": 4.129213483146067e-07, "logps/chosen": -21.20174789428711, "logps/rejected": -27.045516967773438, "loss": 0.6736, "losses/dpo": 0.6594799757003784, "losses/sft": 0.7625675201416016, "losses/total": 0.6594799757003784, "ref_logps/chosen": -21.360929489135742, "ref_logps/rejected": -26.788671493530273, "rewards/accuracies": 0.6640625, "rewards/chosen": 0.015918483957648277, "rewards/margins": 0.04160304740071297, "rewards/rejected": -0.02568456158041954, "step": 102 }, { "epoch": 0.78, "learning_rate": 4.115168539325842e-07, "logps/chosen": -25.287567138671875, "logps/rejected": -27.158187866210938, "loss": 0.6789, "losses/dpo": 0.6871756315231323, "losses/sft": 0.7897288799285889, "losses/total": 0.6871756315231323, "ref_logps/chosen": -25.39737319946289, "ref_logps/rejected": -26.95665740966797, "rewards/accuracies": 0.6015625, "rewards/chosen": 0.010980643332004547, "rewards/margins": 0.03113364614546299, "rewards/rejected": -0.020153000950813293, "step": 103 }, { "epoch": 0.78, "learning_rate": 4.1011235955056177e-07, "logps/chosen": -20.239051818847656, "logps/rejected": -27.055557250976562, "loss": 0.6766, "losses/dpo": 0.6560062170028687, "losses/sft": 0.7211654186248779, "losses/total": 0.6560062170028687, "ref_logps/chosen": -20.345287322998047, "ref_logps/rejected": -26.804546356201172, "rewards/accuracies": 0.640625, "rewards/chosen": 0.010623706504702568, "rewards/margins": 0.03572461009025574, "rewards/rejected": -0.02510090172290802, "step": 104 }, { "epoch": 0.79, "learning_rate": 4.0870786516853933e-07, "logps/chosen": -22.816429138183594, "logps/rejected": -28.331439971923828, "loss": 0.6728, "losses/dpo": 0.6975245475769043, "losses/sft": 0.8287545442581177, "losses/total": 0.6975245475769043, "ref_logps/chosen": -22.96261215209961, "ref_logps/rejected": -28.04006576538086, "rewards/accuracies": 0.7109375, "rewards/chosen": 0.014618270099163055, "rewards/margins": 0.043755702674388885, "rewards/rejected": -0.02913743630051613, "step": 105 }, { "epoch": 0.8, "learning_rate": 4.0730337078651683e-07, "logps/chosen": -22.864845275878906, "logps/rejected": -27.868162155151367, "loss": 0.6776, "losses/dpo": 0.6524635553359985, "losses/sft": 0.8967273235321045, "losses/total": 0.6524635553359985, "ref_logps/chosen": -22.934465408325195, "ref_logps/rejected": -27.60092544555664, "rewards/accuracies": 0.65625, "rewards/chosen": 0.006961943581700325, "rewards/margins": 0.03368568420410156, "rewards/rejected": -0.026723740622401237, "step": 106 }, { "epoch": 0.81, "learning_rate": 4.058988764044944e-07, "logps/chosen": -26.633420944213867, "logps/rejected": -29.40836524963379, "loss": 0.6785, "losses/dpo": 0.6883168816566467, "losses/sft": 0.9007142782211304, "losses/total": 0.6883168816566467, "ref_logps/chosen": -26.658733367919922, "ref_logps/rejected": -29.11638641357422, "rewards/accuracies": 0.625, "rewards/chosen": 0.0025312139187008142, "rewards/margins": 0.031729087233543396, "rewards/rejected": -0.029197873547673225, "step": 107 }, { "epoch": 0.82, "learning_rate": 4.044943820224719e-07, "logps/chosen": -21.93716049194336, "logps/rejected": -26.78734016418457, "loss": 0.6678, "losses/dpo": 0.6620572805404663, "losses/sft": 0.7277075052261353, "losses/total": 0.6620572805404663, "ref_logps/chosen": -22.14274787902832, "ref_logps/rejected": -26.450454711914062, "rewards/accuracies": 0.7265625, "rewards/chosen": 0.02055862732231617, "rewards/margins": 0.05424723029136658, "rewards/rejected": -0.03368859738111496, "step": 108 }, { "epoch": 0.82, "learning_rate": 4.0308988764044945e-07, "logps/chosen": -23.479236602783203, "logps/rejected": -25.321468353271484, "loss": 0.6732, "losses/dpo": 0.6536136865615845, "losses/sft": 0.793202817440033, "losses/total": 0.6536136865615845, "ref_logps/chosen": -23.628402709960938, "ref_logps/rejected": -25.03476905822754, "rewards/accuracies": 0.6171875, "rewards/chosen": 0.014916517771780491, "rewards/margins": 0.043586596846580505, "rewards/rejected": -0.02867007628083229, "step": 109 }, { "epoch": 0.83, "learning_rate": 4.0168539325842696e-07, "logps/chosen": -21.36187744140625, "logps/rejected": -26.808046340942383, "loss": 0.6677, "losses/dpo": 0.658541202545166, "losses/sft": 0.6240718364715576, "losses/total": 0.658541202545166, "ref_logps/chosen": -21.525625228881836, "ref_logps/rejected": -26.417198181152344, "rewards/accuracies": 0.640625, "rewards/chosen": 0.01637459173798561, "rewards/margins": 0.05545924976468086, "rewards/rejected": -0.03908466175198555, "step": 110 }, { "epoch": 0.84, "learning_rate": 4.0028089887640446e-07, "logps/chosen": -22.143728256225586, "logps/rejected": -26.035858154296875, "loss": 0.6732, "losses/dpo": 0.6707695126533508, "losses/sft": 0.8353971838951111, "losses/total": 0.6707695126533508, "ref_logps/chosen": -22.292274475097656, "ref_logps/rejected": -25.744632720947266, "rewards/accuracies": 0.6484375, "rewards/chosen": 0.014854478649795055, "rewards/margins": 0.043976958841085434, "rewards/rejected": -0.029122481122612953, "step": 111 }, { "epoch": 0.85, "learning_rate": 3.9887640449438196e-07, "logps/chosen": -22.15041732788086, "logps/rejected": -24.53826332092285, "loss": 0.6688, "losses/dpo": 0.6656994819641113, "losses/sft": 0.8727293014526367, "losses/total": 0.6656994819641113, "ref_logps/chosen": -22.210494995117188, "ref_logps/rejected": -24.07231330871582, "rewards/accuracies": 0.7109375, "rewards/chosen": 0.0060077933594584465, "rewards/margins": 0.05260289087891579, "rewards/rejected": -0.046595096588134766, "step": 112 }, { "epoch": 0.85, "learning_rate": 3.974719101123595e-07, "logps/chosen": -23.314592361450195, "logps/rejected": -27.797752380371094, "loss": 0.675, "losses/dpo": 0.6690158247947693, "losses/sft": 0.7370929718017578, "losses/total": 0.6690158247947693, "ref_logps/chosen": -23.396080017089844, "ref_logps/rejected": -27.483016967773438, "rewards/accuracies": 0.609375, "rewards/chosen": 0.008148876950144768, "rewards/margins": 0.03962232545018196, "rewards/rejected": -0.031473446637392044, "step": 113 }, { "epoch": 0.86, "learning_rate": 3.960674157303371e-07, "logps/chosen": -21.854373931884766, "logps/rejected": -26.652328491210938, "loss": 0.6706, "losses/dpo": 0.645140528678894, "losses/sft": 0.77164226770401, "losses/total": 0.645140528678894, "ref_logps/chosen": -21.949893951416016, "ref_logps/rejected": -26.255746841430664, "rewards/accuracies": 0.6484375, "rewards/chosen": 0.009551877155900002, "rewards/margins": 0.0492100827395916, "rewards/rejected": -0.03965820372104645, "step": 114 }, { "epoch": 0.87, "learning_rate": 3.946629213483146e-07, "logps/chosen": -23.778413772583008, "logps/rejected": -28.40381622314453, "loss": 0.6634, "losses/dpo": 0.6699668169021606, "losses/sft": 0.8002771139144897, "losses/total": 0.6699668169021606, "ref_logps/chosen": -23.903501510620117, "ref_logps/rejected": -27.89557647705078, "rewards/accuracies": 0.71875, "rewards/chosen": 0.0125090591609478, "rewards/margins": 0.06333282589912415, "rewards/rejected": -0.050823770463466644, "step": 115 }, { "epoch": 0.88, "learning_rate": 3.9325842696629214e-07, "logps/chosen": -24.59353256225586, "logps/rejected": -28.184139251708984, "loss": 0.6658, "losses/dpo": 0.6745936870574951, "losses/sft": 0.8017398715019226, "losses/total": 0.6745936870574951, "ref_logps/chosen": -24.77825164794922, "ref_logps/rejected": -27.77378273010254, "rewards/accuracies": 0.671875, "rewards/chosen": 0.018472209572792053, "rewards/margins": 0.059507861733436584, "rewards/rejected": -0.04103565216064453, "step": 116 }, { "epoch": 0.88, "learning_rate": 3.9185393258426964e-07, "logps/chosen": -20.781490325927734, "logps/rejected": -25.704240798950195, "loss": 0.6641, "losses/dpo": 0.6748782396316528, "losses/sft": 0.6509857177734375, "losses/total": 0.6748782396316528, "ref_logps/chosen": -20.93104362487793, "ref_logps/rejected": -25.223262786865234, "rewards/accuracies": 0.703125, "rewards/chosen": 0.014955190010368824, "rewards/margins": 0.06305292248725891, "rewards/rejected": -0.04809773340821266, "step": 117 }, { "epoch": 0.89, "learning_rate": 3.904494382022472e-07, "logps/chosen": -22.889171600341797, "logps/rejected": -28.954145431518555, "loss": 0.6719, "losses/dpo": 0.6790695190429688, "losses/sft": 0.7899962663650513, "losses/total": 0.6790695190429688, "ref_logps/chosen": -22.998294830322266, "ref_logps/rejected": -28.596576690673828, "rewards/accuracies": 0.640625, "rewards/chosen": 0.010911967605352402, "rewards/margins": 0.04666893184185028, "rewards/rejected": -0.03575696796178818, "step": 118 }, { "epoch": 0.9, "learning_rate": 3.890449438202247e-07, "logps/chosen": -22.229143142700195, "logps/rejected": -24.892658233642578, "loss": 0.6713, "losses/dpo": 0.6665077209472656, "losses/sft": 0.8753491044044495, "losses/total": 0.6665077209472656, "ref_logps/chosen": -22.402416229248047, "ref_logps/rejected": -24.568809509277344, "rewards/accuracies": 0.6484375, "rewards/chosen": 0.01732712611556053, "rewards/margins": 0.0497119314968586, "rewards/rejected": -0.032384805381298065, "step": 119 }, { "epoch": 0.91, "learning_rate": 3.876404494382022e-07, "logps/chosen": -22.233783721923828, "logps/rejected": -29.53872299194336, "loss": 0.6637, "losses/dpo": 0.6545946002006531, "losses/sft": 0.8056938052177429, "losses/total": 0.6545946002006531, "ref_logps/chosen": -22.328821182250977, "ref_logps/rejected": -28.996824264526367, "rewards/accuracies": 0.7265625, "rewards/chosen": 0.009503833949565887, "rewards/margins": 0.06369376927614212, "rewards/rejected": -0.05418993532657623, "step": 120 }, { "epoch": 0.91, "learning_rate": 3.8623595505617977e-07, "logps/chosen": -24.073867797851562, "logps/rejected": -27.632476806640625, "loss": 0.6778, "losses/dpo": 0.6500009298324585, "losses/sft": 0.9210071563720703, "losses/total": 0.6500009298324585, "ref_logps/chosen": -24.12955093383789, "ref_logps/rejected": -27.32662582397461, "rewards/accuracies": 0.5859375, "rewards/chosen": 0.00556858628988266, "rewards/margins": 0.03615354374051094, "rewards/rejected": -0.03058495745062828, "step": 121 }, { "epoch": 0.92, "learning_rate": 3.8483146067415727e-07, "logps/chosen": -21.38442039489746, "logps/rejected": -31.358665466308594, "loss": 0.6601, "losses/dpo": 0.6630659103393555, "losses/sft": 0.8758641481399536, "losses/total": 0.6630659103393555, "ref_logps/chosen": -21.540292739868164, "ref_logps/rejected": -30.79846954345703, "rewards/accuracies": 0.703125, "rewards/chosen": 0.01558714546263218, "rewards/margins": 0.07160677015781403, "rewards/rejected": -0.056019626557826996, "step": 122 }, { "epoch": 0.93, "learning_rate": 3.834269662921348e-07, "logps/chosen": -21.09262466430664, "logps/rejected": -25.64166831970215, "loss": 0.6622, "losses/dpo": 0.6400080919265747, "losses/sft": 0.8849148750305176, "losses/total": 0.6400080919265747, "ref_logps/chosen": -21.179445266723633, "ref_logps/rejected": -25.056869506835938, "rewards/accuracies": 0.6796875, "rewards/chosen": 0.00868179090321064, "rewards/margins": 0.06716156005859375, "rewards/rejected": -0.05847976729273796, "step": 123 }, { "epoch": 0.94, "learning_rate": 3.8202247191011233e-07, "logps/chosen": -25.65859603881836, "logps/rejected": -28.025104522705078, "loss": 0.6765, "losses/dpo": 0.6927012205123901, "losses/sft": 0.8673559427261353, "losses/total": 0.6927012205123901, "ref_logps/chosen": -25.61608123779297, "ref_logps/rejected": -27.600624084472656, "rewards/accuracies": 0.59375, "rewards/chosen": -0.004251426085829735, "rewards/margins": 0.038196537643671036, "rewards/rejected": -0.04244796186685562, "step": 124 }, { "epoch": 0.94, "learning_rate": 3.806179775280899e-07, "logps/chosen": -23.93341636657715, "logps/rejected": -29.840375900268555, "loss": 0.6647, "losses/dpo": 0.7150436639785767, "losses/sft": 0.9468034505844116, "losses/total": 0.7150436639785767, "ref_logps/chosen": -23.979652404785156, "ref_logps/rejected": -29.25320816040039, "rewards/accuracies": 0.703125, "rewards/chosen": 0.004623853601515293, "rewards/margins": 0.06334076821804047, "rewards/rejected": -0.058716922998428345, "step": 125 }, { "epoch": 0.95, "learning_rate": 3.792134831460674e-07, "logps/chosen": -25.031259536743164, "logps/rejected": -28.292198181152344, "loss": 0.6559, "losses/dpo": 0.6770719289779663, "losses/sft": 0.9255229234695435, "losses/total": 0.6770719289779663, "ref_logps/chosen": -25.22754669189453, "ref_logps/rejected": -27.667905807495117, "rewards/accuracies": 0.7265625, "rewards/chosen": 0.019628863781690598, "rewards/margins": 0.08205802738666534, "rewards/rejected": -0.06242916360497475, "step": 126 }, { "epoch": 0.96, "learning_rate": 3.7780898876404495e-07, "logps/chosen": -21.68558692932129, "logps/rejected": -26.84676742553711, "loss": 0.6765, "losses/dpo": 0.635480523109436, "losses/sft": 0.7413178086280823, "losses/total": 0.635480523109436, "ref_logps/chosen": -21.638694763183594, "ref_logps/rejected": -26.392860412597656, "rewards/accuracies": 0.5390625, "rewards/chosen": -0.004689330700784922, "rewards/margins": 0.04070135951042175, "rewards/rejected": -0.04539068788290024, "step": 127 }, { "epoch": 0.97, "learning_rate": 3.7640449438202245e-07, "logps/chosen": -22.910152435302734, "logps/rejected": -26.53976821899414, "loss": 0.6587, "losses/dpo": 0.6835530400276184, "losses/sft": 0.9732310771942139, "losses/total": 0.6835530400276184, "ref_logps/chosen": -23.018016815185547, "ref_logps/rejected": -25.88375473022461, "rewards/accuracies": 0.703125, "rewards/chosen": 0.010786494240164757, "rewards/margins": 0.07638738304376602, "rewards/rejected": -0.06560088694095612, "step": 128 }, { "epoch": 0.97, "learning_rate": 3.75e-07, "logps/chosen": -23.20888900756836, "logps/rejected": -26.875211715698242, "loss": 0.6617, "losses/dpo": 0.6463422775268555, "losses/sft": 0.7454620599746704, "losses/total": 0.6463422775268555, "ref_logps/chosen": -23.336442947387695, "ref_logps/rejected": -26.300058364868164, "rewards/accuracies": 0.6875, "rewards/chosen": 0.012755412608385086, "rewards/margins": 0.0702708438038826, "rewards/rejected": -0.057515427470207214, "step": 129 }, { "epoch": 0.98, "learning_rate": 3.735955056179775e-07, "logps/chosen": -22.396747589111328, "logps/rejected": -29.472164154052734, "loss": 0.6784, "losses/dpo": 0.6625787019729614, "losses/sft": 0.7854889631271362, "losses/total": 0.6625787019729614, "ref_logps/chosen": -22.277332305908203, "ref_logps/rejected": -28.998043060302734, "rewards/accuracies": 0.6328125, "rewards/chosen": -0.01194157637655735, "rewards/margins": 0.03547064587473869, "rewards/rejected": -0.047412216663360596, "step": 130 }, { "epoch": 0.99, "learning_rate": 3.72191011235955e-07, "logps/chosen": -18.81739044189453, "logps/rejected": -24.600296020507812, "loss": 0.6612, "losses/dpo": 0.6598723530769348, "losses/sft": 0.8644169569015503, "losses/total": 0.6598723530769348, "ref_logps/chosen": -18.89391326904297, "ref_logps/rejected": -23.96309471130371, "rewards/accuracies": 0.671875, "rewards/chosen": 0.0076522137969732285, "rewards/margins": 0.07137227803468704, "rewards/rejected": -0.06372006982564926, "step": 131 }, { "epoch": 1.0, "learning_rate": 3.707865168539326e-07, "logps/chosen": -25.24700927734375, "logps/rejected": -29.2607364654541, "loss": 0.6576, "losses/dpo": 0.6264052391052246, "losses/sft": 0.7484258413314819, "losses/total": 0.6264052391052246, "ref_logps/chosen": -25.243091583251953, "ref_logps/rejected": -28.458097457885742, "rewards/accuracies": 0.65625, "rewards/chosen": -0.00039180926978588104, "rewards/margins": 0.07987209409475327, "rewards/rejected": -0.0802639052271843, "step": 132 }, { "epoch": 1.0, "learning_rate": 3.693820224719101e-07, "logps/chosen": -24.664264678955078, "logps/rejected": -29.071331024169922, "loss": 0.6596, "losses/dpo": 0.6850643157958984, "losses/sft": 0.7063156366348267, "losses/total": 0.6850643157958984, "ref_logps/chosen": -24.58011245727539, "ref_logps/rejected": -28.208541870117188, "rewards/accuracies": 0.71875, "rewards/chosen": -0.008415229618549347, "rewards/margins": 0.07786377519369125, "rewards/rejected": -0.08627899736166, "step": 133 }, { "epoch": 1.01, "learning_rate": 3.6797752808988764e-07, "logps/chosen": -21.803192138671875, "logps/rejected": -25.79207992553711, "loss": 0.6529, "losses/dpo": 0.6567816734313965, "losses/sft": 0.8528650403022766, "losses/total": 0.6567816734313965, "ref_logps/chosen": -21.88966941833496, "ref_logps/rejected": -24.97705841064453, "rewards/accuracies": 0.6875, "rewards/chosen": 0.00864771381020546, "rewards/margins": 0.0901501327753067, "rewards/rejected": -0.08150242269039154, "step": 134 }, { "epoch": 1.02, "learning_rate": 3.6657303370786514e-07, "logps/chosen": -20.78626823425293, "logps/rejected": -27.048810958862305, "loss": 0.6442, "losses/dpo": 0.6402660608291626, "losses/sft": 0.7653439044952393, "losses/total": 0.6402660608291626, "ref_logps/chosen": -20.915481567382812, "ref_logps/rejected": -26.105587005615234, "rewards/accuracies": 0.765625, "rewards/chosen": 0.012921325862407684, "rewards/margins": 0.10724389553070068, "rewards/rejected": -0.0943225771188736, "step": 135 }, { "epoch": 1.03, "learning_rate": 3.651685393258427e-07, "logps/chosen": -23.661598205566406, "logps/rejected": -26.884532928466797, "loss": 0.6563, "losses/dpo": 0.6588989496231079, "losses/sft": 0.8334387540817261, "losses/total": 0.6588989496231079, "ref_logps/chosen": -23.68170166015625, "ref_logps/rejected": -26.042449951171875, "rewards/accuracies": 0.671875, "rewards/chosen": 0.0020102611742913723, "rewards/margins": 0.08621874451637268, "rewards/rejected": -0.08420848101377487, "step": 136 }, { "epoch": 1.03, "learning_rate": 3.637640449438202e-07, "logps/chosen": -21.846914291381836, "logps/rejected": -26.843595504760742, "loss": 0.6414, "losses/dpo": 0.610801100730896, "losses/sft": 0.6104759573936462, "losses/total": 0.610801100730896, "ref_logps/chosen": -21.904037475585938, "ref_logps/rejected": -25.758628845214844, "rewards/accuracies": 0.7734375, "rewards/chosen": 0.005712391808629036, "rewards/margins": 0.11420895159244537, "rewards/rejected": -0.10849656164646149, "step": 137 }, { "epoch": 1.04, "learning_rate": 3.6235955056179776e-07, "logps/chosen": -23.79953384399414, "logps/rejected": -26.24932861328125, "loss": 0.6507, "losses/dpo": 0.6711180806159973, "losses/sft": 0.8334028720855713, "losses/total": 0.6711180806159973, "ref_logps/chosen": -23.89289093017578, "ref_logps/rejected": -25.393817901611328, "rewards/accuracies": 0.6953125, "rewards/chosen": 0.009335671551525593, "rewards/margins": 0.09488671272993088, "rewards/rejected": -0.08555103838443756, "step": 138 }, { "epoch": 1.05, "learning_rate": 3.6095505617977526e-07, "logps/chosen": -20.413612365722656, "logps/rejected": -28.091732025146484, "loss": 0.6393, "losses/dpo": 0.6086191534996033, "losses/sft": 0.7045127749443054, "losses/total": 0.6086191534996033, "ref_logps/chosen": -20.591529846191406, "ref_logps/rejected": -27.054677963256836, "rewards/accuracies": 0.703125, "rewards/chosen": 0.0177919864654541, "rewards/margins": 0.12149728834629059, "rewards/rejected": -0.10370529443025589, "step": 139 }, { "epoch": 1.06, "learning_rate": 3.5955056179775277e-07, "logps/chosen": -23.96946907043457, "logps/rejected": -25.42624282836914, "loss": 0.6574, "losses/dpo": 0.6771029233932495, "losses/sft": 0.8275946378707886, "losses/total": 0.6771029233932495, "ref_logps/chosen": -23.84187126159668, "ref_logps/rejected": -24.474294662475586, "rewards/accuracies": 0.703125, "rewards/chosen": -0.012759597972035408, "rewards/margins": 0.0824354737997055, "rewards/rejected": -0.09519506990909576, "step": 140 }, { "epoch": 1.06, "learning_rate": 3.581460674157303e-07, "logps/chosen": -20.24493980407715, "logps/rejected": -26.33192253112793, "loss": 0.6403, "losses/dpo": 0.60587477684021, "losses/sft": 0.7718257904052734, "losses/total": 0.60587477684021, "ref_logps/chosen": -20.375638961791992, "ref_logps/rejected": -25.299020767211914, "rewards/accuracies": 0.75, "rewards/chosen": 0.013069930486381054, "rewards/margins": 0.11636004596948624, "rewards/rejected": -0.10329011082649231, "step": 141 }, { "epoch": 1.07, "learning_rate": 3.5674157303370783e-07, "logps/chosen": -22.9414119720459, "logps/rejected": -28.200380325317383, "loss": 0.6384, "losses/dpo": 0.6827423572540283, "losses/sft": 0.8567611575126648, "losses/total": 0.6827423572540283, "ref_logps/chosen": -23.111347198486328, "ref_logps/rejected": -27.142616271972656, "rewards/accuracies": 0.7421875, "rewards/chosen": 0.01699351891875267, "rewards/margins": 0.12276984751224518, "rewards/rejected": -0.10577632486820221, "step": 142 }, { "epoch": 1.08, "learning_rate": 3.553370786516854e-07, "logps/chosen": -23.226070404052734, "logps/rejected": -27.77198028564453, "loss": 0.6624, "losses/dpo": 0.6864386796951294, "losses/sft": 0.8041479587554932, "losses/total": 0.6864386796951294, "ref_logps/chosen": -22.92740249633789, "ref_logps/rejected": -26.72946548461914, "rewards/accuracies": 0.625, "rewards/chosen": -0.029866419732570648, "rewards/margins": 0.07438516616821289, "rewards/rejected": -0.10425157845020294, "step": 143 }, { "epoch": 1.09, "learning_rate": 3.539325842696629e-07, "logps/chosen": -21.75617027282715, "logps/rejected": -28.53704833984375, "loss": 0.6455, "losses/dpo": 0.6347097158432007, "losses/sft": 0.6569658517837524, "losses/total": 0.6347097158432007, "ref_logps/chosen": -21.872474670410156, "ref_logps/rejected": -27.540082931518555, "rewards/accuracies": 0.7109375, "rewards/chosen": 0.011630430817604065, "rewards/margins": 0.11132718622684479, "rewards/rejected": -0.09969674795866013, "step": 144 }, { "epoch": 1.09, "learning_rate": 3.5252808988764045e-07, "logps/chosen": -24.18975830078125, "logps/rejected": -29.736862182617188, "loss": 0.6407, "losses/dpo": 0.6530706286430359, "losses/sft": 0.8703383207321167, "losses/total": 0.6530706286430359, "ref_logps/chosen": -24.138484954833984, "ref_logps/rejected": -28.495933532714844, "rewards/accuracies": 0.71875, "rewards/chosen": -0.005126964300870895, "rewards/margins": 0.11896562576293945, "rewards/rejected": -0.12409258633852005, "step": 145 }, { "epoch": 1.1, "learning_rate": 3.51123595505618e-07, "logps/chosen": -24.84428596496582, "logps/rejected": -29.576303482055664, "loss": 0.647, "losses/dpo": 0.6477080583572388, "losses/sft": 0.8653473854064941, "losses/total": 0.6477080583572388, "ref_logps/chosen": -24.755064010620117, "ref_logps/rejected": -28.43872833251953, "rewards/accuracies": 0.7265625, "rewards/chosen": -0.008922239765524864, "rewards/margins": 0.10483534634113312, "rewards/rejected": -0.11375758051872253, "step": 146 }, { "epoch": 1.11, "learning_rate": 3.497191011235955e-07, "logps/chosen": -24.983165740966797, "logps/rejected": -27.753063201904297, "loss": 0.6095, "losses/dpo": 0.6273882389068604, "losses/sft": 0.8987213373184204, "losses/total": 0.6273882389068604, "ref_logps/chosen": -25.17366600036621, "ref_logps/rejected": -26.05366325378418, "rewards/accuracies": 0.8125, "rewards/chosen": 0.019050076603889465, "rewards/margins": 0.1889900416135788, "rewards/rejected": -0.16993993520736694, "step": 147 }, { "epoch": 1.12, "learning_rate": 3.48314606741573e-07, "logps/chosen": -22.61692237854004, "logps/rejected": -27.743179321289062, "loss": 0.6583, "losses/dpo": 0.6790063381195068, "losses/sft": 0.7648496627807617, "losses/total": 0.6790063381195068, "ref_logps/chosen": -22.40664291381836, "ref_logps/rejected": -26.67925262451172, "rewards/accuracies": 0.671875, "rewards/chosen": -0.02102772891521454, "rewards/margins": 0.08536479622125626, "rewards/rejected": -0.1063925176858902, "step": 148 }, { "epoch": 1.12, "learning_rate": 3.469101123595505e-07, "logps/chosen": -22.846782684326172, "logps/rejected": -29.590002059936523, "loss": 0.6261, "losses/dpo": 0.6479306221008301, "losses/sft": 0.8049210906028748, "losses/total": 0.6479306221008301, "ref_logps/chosen": -23.011579513549805, "ref_logps/rejected": -28.2562198638916, "rewards/accuracies": 0.7734375, "rewards/chosen": 0.016479745507240295, "rewards/margins": 0.14985813200473785, "rewards/rejected": -0.13337840139865875, "step": 149 }, { "epoch": 1.13, "learning_rate": 3.4550561797752807e-07, "logps/chosen": -21.699583053588867, "logps/rejected": -27.46141815185547, "loss": 0.6277, "losses/dpo": 0.6358213424682617, "losses/sft": 0.8344307541847229, "losses/total": 0.6358213424682617, "ref_logps/chosen": -21.698383331298828, "ref_logps/rejected": -25.974313735961914, "rewards/accuracies": 0.7734375, "rewards/chosen": -0.00012012943625450134, "rewards/margins": 0.14859014749526978, "rewards/rejected": -0.14871028065681458, "step": 150 }, { "epoch": 1.14, "learning_rate": 3.441011235955056e-07, "logps/chosen": -20.88718032836914, "logps/rejected": -25.436817169189453, "loss": 0.654, "losses/dpo": 0.6406779289245605, "losses/sft": 0.8018806576728821, "losses/total": 0.6406779289245605, "ref_logps/chosen": -20.712448120117188, "ref_logps/rejected": -24.37557029724121, "rewards/accuracies": 0.671875, "rewards/chosen": -0.01747327297925949, "rewards/margins": 0.08865140378475189, "rewards/rejected": -0.10612466931343079, "step": 151 }, { "epoch": 1.15, "learning_rate": 3.4269662921348313e-07, "logps/chosen": -22.312236785888672, "logps/rejected": -30.142927169799805, "loss": 0.6355, "losses/dpo": 0.5932921171188354, "losses/sft": 0.6528638005256653, "losses/total": 0.5932921171188354, "ref_logps/chosen": -22.210554122924805, "ref_logps/rejected": -28.713180541992188, "rewards/accuracies": 0.703125, "rewards/chosen": -0.010168392211198807, "rewards/margins": 0.13280624151229858, "rewards/rejected": -0.1429746299982071, "step": 152 }, { "epoch": 1.15, "learning_rate": 3.4129213483146064e-07, "logps/chosen": -26.28810691833496, "logps/rejected": -29.10406494140625, "loss": 0.6359, "losses/dpo": 0.6205468773841858, "losses/sft": 0.8744308352470398, "losses/total": 0.6205468773841858, "ref_logps/chosen": -26.150360107421875, "ref_logps/rejected": -27.655288696289062, "rewards/accuracies": 0.7890625, "rewards/chosen": -0.013774631544947624, "rewards/margins": 0.1311032772064209, "rewards/rejected": -0.14487791061401367, "step": 153 }, { "epoch": 1.16, "learning_rate": 3.398876404494382e-07, "logps/chosen": -22.283679962158203, "logps/rejected": -26.302614212036133, "loss": 0.6679, "losses/dpo": 0.6655905246734619, "losses/sft": 0.8864909410476685, "losses/total": 0.6655905246734619, "ref_logps/chosen": -21.867923736572266, "ref_logps/rejected": -25.230295181274414, "rewards/accuracies": 0.6484375, "rewards/chosen": -0.04157543182373047, "rewards/margins": 0.0656563863158226, "rewards/rejected": -0.10723182559013367, "step": 154 }, { "epoch": 1.17, "learning_rate": 3.3848314606741575e-07, "logps/chosen": -22.68756103515625, "logps/rejected": -28.45652961730957, "loss": 0.6559, "losses/dpo": 0.6645406484603882, "losses/sft": 0.794353723526001, "losses/total": 0.6645406484603882, "ref_logps/chosen": -22.445241928100586, "ref_logps/rejected": -27.26552963256836, "rewards/accuracies": 0.6328125, "rewards/chosen": -0.024232013151049614, "rewards/margins": 0.09486782550811768, "rewards/rejected": -0.11909983307123184, "step": 155 }, { "epoch": 1.18, "learning_rate": 3.3707865168539325e-07, "logps/chosen": -22.336397171020508, "logps/rejected": -27.09580421447754, "loss": 0.6194, "losses/dpo": 0.5837043523788452, "losses/sft": 0.9716494083404541, "losses/total": 0.5837043523788452, "ref_logps/chosen": -22.35472297668457, "ref_logps/rejected": -25.422115325927734, "rewards/accuracies": 0.7734375, "rewards/chosen": 0.001832372508943081, "rewards/margins": 0.16920123994350433, "rewards/rejected": -0.16736885905265808, "step": 156 }, { "epoch": 1.18, "learning_rate": 3.356741573033708e-07, "logps/chosen": -22.49996566772461, "logps/rejected": -28.435253143310547, "loss": 0.6425, "losses/dpo": 0.6696836948394775, "losses/sft": 0.773880660533905, "losses/total": 0.6696836948394775, "ref_logps/chosen": -22.170368194580078, "ref_logps/rejected": -26.929363250732422, "rewards/accuracies": 0.75, "rewards/chosen": -0.032960131764411926, "rewards/margins": 0.11762877553701401, "rewards/rejected": -0.15058889985084534, "step": 157 }, { "epoch": 1.19, "learning_rate": 3.3426966292134826e-07, "logps/chosen": -22.498619079589844, "logps/rejected": -30.868057250976562, "loss": 0.6295, "losses/dpo": 0.6400988101959229, "losses/sft": 0.724359929561615, "losses/total": 0.6400988101959229, "ref_logps/chosen": -22.199575424194336, "ref_logps/rejected": -29.1011962890625, "rewards/accuracies": 0.7109375, "rewards/chosen": -0.029904408380389214, "rewards/margins": 0.1467815339565277, "rewards/rejected": -0.17668592929840088, "step": 158 }, { "epoch": 1.2, "learning_rate": 3.328651685393258e-07, "logps/chosen": -24.872241973876953, "logps/rejected": -29.327089309692383, "loss": 0.6331, "losses/dpo": 0.6349748373031616, "losses/sft": 0.7728020548820496, "losses/total": 0.6349748373031616, "ref_logps/chosen": -24.600563049316406, "ref_logps/rejected": -27.632978439331055, "rewards/accuracies": 0.6953125, "rewards/chosen": -0.027167750522494316, "rewards/margins": 0.14224328100681305, "rewards/rejected": -0.16941101849079132, "step": 159 }, { "epoch": 1.21, "learning_rate": 3.314606741573033e-07, "logps/chosen": -25.719676971435547, "logps/rejected": -28.384960174560547, "loss": 0.6269, "losses/dpo": 0.6175022721290588, "losses/sft": 0.8887324929237366, "losses/total": 0.6175022721290588, "ref_logps/chosen": -25.583393096923828, "ref_logps/rejected": -26.621837615966797, "rewards/accuracies": 0.703125, "rewards/chosen": -0.013628311455249786, "rewards/margins": 0.16268408298492432, "rewards/rejected": -0.1763123720884323, "step": 160 }, { "epoch": 1.22, "learning_rate": 3.300561797752809e-07, "logps/chosen": -20.547767639160156, "logps/rejected": -26.39871597290039, "loss": 0.6418, "losses/dpo": 0.604182243347168, "losses/sft": 0.63340824842453, "losses/total": 0.604182243347168, "ref_logps/chosen": -20.272342681884766, "ref_logps/rejected": -24.899860382080078, "rewards/accuracies": 0.7109375, "rewards/chosen": -0.027542442083358765, "rewards/margins": 0.12234312295913696, "rewards/rejected": -0.14988556504249573, "step": 161 }, { "epoch": 1.22, "learning_rate": 3.2865168539325844e-07, "logps/chosen": -22.42629623413086, "logps/rejected": -27.69287872314453, "loss": 0.6111, "losses/dpo": 0.5942946672439575, "losses/sft": 0.9472201466560364, "losses/total": 0.5942946672439575, "ref_logps/chosen": -22.135528564453125, "ref_logps/rejected": -25.469520568847656, "rewards/accuracies": 0.7421875, "rewards/chosen": -0.029076654464006424, "rewards/margins": 0.1932588517665863, "rewards/rejected": -0.22233551740646362, "step": 162 }, { "epoch": 1.23, "learning_rate": 3.2724719101123594e-07, "logps/chosen": -23.306896209716797, "logps/rejected": -28.64287567138672, "loss": 0.6467, "losses/dpo": 0.6821013689041138, "losses/sft": 0.9050745368003845, "losses/total": 0.6821013689041138, "ref_logps/chosen": -22.86626625061035, "ref_logps/rejected": -27.0958251953125, "rewards/accuracies": 0.671875, "rewards/chosen": -0.04406279698014259, "rewards/margins": 0.11064193397760391, "rewards/rejected": -0.1547047346830368, "step": 163 }, { "epoch": 1.24, "learning_rate": 3.258426966292135e-07, "logps/chosen": -24.126543045043945, "logps/rejected": -26.020713806152344, "loss": 0.6214, "losses/dpo": 0.6081950664520264, "losses/sft": 0.827450692653656, "losses/total": 0.6081950664520264, "ref_logps/chosen": -23.934072494506836, "ref_logps/rejected": -24.092741012573242, "rewards/accuracies": 0.765625, "rewards/chosen": -0.01924710161983967, "rewards/margins": 0.17355017364025116, "rewards/rejected": -0.19279725849628448, "step": 164 }, { "epoch": 1.25, "learning_rate": 3.24438202247191e-07, "logps/chosen": -23.07083511352539, "logps/rejected": -29.666513442993164, "loss": 0.6401, "losses/dpo": 0.6096771955490112, "losses/sft": 0.7951339483261108, "losses/total": 0.6096771955490112, "ref_logps/chosen": -22.55157470703125, "ref_logps/rejected": -27.86458396911621, "rewards/accuracies": 0.6796875, "rewards/chosen": -0.05192602425813675, "rewards/margins": 0.12826718389987946, "rewards/rejected": -0.1801932007074356, "step": 165 }, { "epoch": 1.25, "learning_rate": 3.2303370786516856e-07, "logps/chosen": -23.97926139831543, "logps/rejected": -26.387611389160156, "loss": 0.6543, "losses/dpo": 0.5806229710578918, "losses/sft": 0.9021787047386169, "losses/total": 0.5806229710578918, "ref_logps/chosen": -23.097957611083984, "ref_logps/rejected": -24.48831558227539, "rewards/accuracies": 0.609375, "rewards/chosen": -0.0881301686167717, "rewards/margins": 0.10179921984672546, "rewards/rejected": -0.18992936611175537, "step": 166 }, { "epoch": 1.26, "learning_rate": 3.21629213483146e-07, "logps/chosen": -24.496349334716797, "logps/rejected": -28.20893669128418, "loss": 0.6439, "losses/dpo": 0.5786381959915161, "losses/sft": 0.9020153284072876, "losses/total": 0.5786381959915161, "ref_logps/chosen": -24.026926040649414, "ref_logps/rejected": -26.54248046875, "rewards/accuracies": 0.6484375, "rewards/chosen": -0.04694243520498276, "rewards/margins": 0.11970352381467819, "rewards/rejected": -0.16664597392082214, "step": 167 }, { "epoch": 1.27, "learning_rate": 3.2022471910112357e-07, "logps/chosen": -23.227306365966797, "logps/rejected": -29.19955825805664, "loss": 0.6389, "losses/dpo": 0.6521559953689575, "losses/sft": 0.9907703399658203, "losses/total": 0.6521559953689575, "ref_logps/chosen": -22.804248809814453, "ref_logps/rejected": -27.36874008178711, "rewards/accuracies": 0.640625, "rewards/chosen": -0.042305897921323776, "rewards/margins": 0.14077602326869965, "rewards/rejected": -0.18308192491531372, "step": 168 }, { "epoch": 1.28, "learning_rate": 3.1882022471910107e-07, "logps/chosen": -22.211841583251953, "logps/rejected": -27.533721923828125, "loss": 0.6512, "losses/dpo": 0.6903020143508911, "losses/sft": 0.8463045358657837, "losses/total": 0.6903020143508911, "ref_logps/chosen": -21.630611419677734, "ref_logps/rejected": -25.909526824951172, "rewards/accuracies": 0.6640625, "rewards/chosen": -0.05812288075685501, "rewards/margins": 0.10429678112268448, "rewards/rejected": -0.1624196618795395, "step": 169 }, { "epoch": 1.28, "learning_rate": 3.1741573033707863e-07, "logps/chosen": -22.332489013671875, "logps/rejected": -28.400074005126953, "loss": 0.6155, "losses/dpo": 0.6296464204788208, "losses/sft": 0.6626120805740356, "losses/total": 0.6296464204788208, "ref_logps/chosen": -22.126087188720703, "ref_logps/rejected": -26.414535522460938, "rewards/accuracies": 0.796875, "rewards/chosen": -0.020640213042497635, "rewards/margins": 0.17791378498077393, "rewards/rejected": -0.19855400919914246, "step": 170 }, { "epoch": 1.29, "learning_rate": 3.160112359550562e-07, "logps/chosen": -23.771900177001953, "logps/rejected": -30.088207244873047, "loss": 0.5971, "losses/dpo": 0.6422166228294373, "losses/sft": 0.7472187876701355, "losses/total": 0.6422166228294373, "ref_logps/chosen": -23.688966751098633, "ref_logps/rejected": -27.655853271484375, "rewards/accuracies": 0.7734375, "rewards/chosen": -0.008293594233691692, "rewards/margins": 0.23494186997413635, "rewards/rejected": -0.24323543906211853, "step": 171 }, { "epoch": 1.3, "learning_rate": 3.146067415730337e-07, "logps/chosen": -23.348037719726562, "logps/rejected": -27.53687286376953, "loss": 0.6459, "losses/dpo": 0.6455183029174805, "losses/sft": 0.8395851850509644, "losses/total": 0.6455183029174805, "ref_logps/chosen": -22.63860511779785, "ref_logps/rejected": -25.60868263244629, "rewards/accuracies": 0.6640625, "rewards/chosen": -0.0709431990981102, "rewards/margins": 0.12187594175338745, "rewards/rejected": -0.19281914830207825, "step": 172 }, { "epoch": 1.31, "learning_rate": 3.1320224719101125e-07, "logps/chosen": -24.17770767211914, "logps/rejected": -30.49142074584961, "loss": 0.627, "losses/dpo": 0.6627662181854248, "losses/sft": 0.9079832434654236, "losses/total": 0.6627662181854248, "ref_logps/chosen": -23.401166915893555, "ref_logps/rejected": -28.0411376953125, "rewards/accuracies": 0.6796875, "rewards/chosen": -0.07765418291091919, "rewards/margins": 0.16737422347068787, "rewards/rejected": -0.24502840638160706, "step": 173 }, { "epoch": 1.31, "learning_rate": 3.1179775280898875e-07, "logps/chosen": -24.392324447631836, "logps/rejected": -27.670101165771484, "loss": 0.6251, "losses/dpo": 0.6143248081207275, "losses/sft": 0.6558141112327576, "losses/total": 0.6143248081207275, "ref_logps/chosen": -23.9196720123291, "ref_logps/rejected": -25.53693389892578, "rewards/accuracies": 0.7265625, "rewards/chosen": -0.047265198081731796, "rewards/margins": 0.16605158150196075, "rewards/rejected": -0.21331676840782166, "step": 174 }, { "epoch": 1.32, "learning_rate": 3.103932584269663e-07, "logps/chosen": -24.742660522460938, "logps/rejected": -33.37188720703125, "loss": 0.6157, "losses/dpo": 0.5933184623718262, "losses/sft": 0.9941530227661133, "losses/total": 0.5933184623718262, "ref_logps/chosen": -24.150442123413086, "ref_logps/rejected": -30.931093215942383, "rewards/accuracies": 0.71875, "rewards/chosen": -0.05922209471464157, "rewards/margins": 0.1848573535680771, "rewards/rejected": -0.24407947063446045, "step": 175 }, { "epoch": 1.33, "learning_rate": 3.0898876404494376e-07, "logps/chosen": -23.68863296508789, "logps/rejected": -28.516223907470703, "loss": 0.6428, "losses/dpo": 0.6548395156860352, "losses/sft": 0.9564076066017151, "losses/total": 0.6548395156860352, "ref_logps/chosen": -22.73943519592285, "ref_logps/rejected": -26.3284912109375, "rewards/accuracies": 0.6875, "rewards/chosen": -0.09491994976997375, "rewards/margins": 0.12385320663452148, "rewards/rejected": -0.21877314150333405, "step": 176 }, { "epoch": 1.34, "learning_rate": 3.075842696629213e-07, "logps/chosen": -22.527427673339844, "logps/rejected": -31.241607666015625, "loss": 0.6179, "losses/dpo": 0.5700336694717407, "losses/sft": 0.8869008421897888, "losses/total": 0.5700336694717407, "ref_logps/chosen": -21.73688507080078, "ref_logps/rejected": -28.55950164794922, "rewards/accuracies": 0.6875, "rewards/chosen": -0.07905411720275879, "rewards/margins": 0.18915657699108124, "rewards/rejected": -0.2682107090950012, "step": 177 }, { "epoch": 1.34, "learning_rate": 3.0617977528089887e-07, "logps/chosen": -22.916969299316406, "logps/rejected": -26.000946044921875, "loss": 0.6425, "losses/dpo": 0.651595413684845, "losses/sft": 0.8127326369285583, "losses/total": 0.651595413684845, "ref_logps/chosen": -22.194671630859375, "ref_logps/rejected": -23.969348907470703, "rewards/accuracies": 0.671875, "rewards/chosen": -0.07222998142242432, "rewards/margins": 0.1309295892715454, "rewards/rejected": -0.20315957069396973, "step": 178 }, { "epoch": 1.35, "learning_rate": 3.047752808988764e-07, "logps/chosen": -20.320987701416016, "logps/rejected": -27.46251106262207, "loss": 0.6217, "losses/dpo": 0.7334872484207153, "losses/sft": 0.9430239200592041, "losses/total": 0.7334872484207153, "ref_logps/chosen": -19.424144744873047, "ref_logps/rejected": -24.763113021850586, "rewards/accuracies": 0.6953125, "rewards/chosen": -0.08968427777290344, "rewards/margins": 0.18025556206703186, "rewards/rejected": -0.2699398398399353, "step": 179 }, { "epoch": 1.36, "learning_rate": 3.0337078651685393e-07, "logps/chosen": -23.853857040405273, "logps/rejected": -27.422889709472656, "loss": 0.6381, "losses/dpo": 0.6393001079559326, "losses/sft": 0.766620397567749, "losses/total": 0.6393001079559326, "ref_logps/chosen": -22.798233032226562, "ref_logps/rejected": -24.980205535888672, "rewards/accuracies": 0.6796875, "rewards/chosen": -0.10556241869926453, "rewards/margins": 0.13870559632778168, "rewards/rejected": -0.2442680299282074, "step": 180 }, { "epoch": 1.37, "learning_rate": 3.0196629213483144e-07, "logps/chosen": -23.360549926757812, "logps/rejected": -27.110477447509766, "loss": 0.6234, "losses/dpo": 0.6311055421829224, "losses/sft": 0.9324018955230713, "losses/total": 0.6311055421829224, "ref_logps/chosen": -22.890331268310547, "ref_logps/rejected": -24.903316497802734, "rewards/accuracies": 0.7421875, "rewards/chosen": -0.04702185466885567, "rewards/margins": 0.17369432747364044, "rewards/rejected": -0.2207161784172058, "step": 181 }, { "epoch": 1.37, "learning_rate": 3.00561797752809e-07, "logps/chosen": -23.004093170166016, "logps/rejected": -31.04292106628418, "loss": 0.5926, "losses/dpo": 0.6243355870246887, "losses/sft": 0.8456003665924072, "losses/total": 0.6243355870246887, "ref_logps/chosen": -22.555362701416016, "ref_logps/rejected": -28.14826011657715, "rewards/accuracies": 0.7578125, "rewards/chosen": -0.04487309604883194, "rewards/margins": 0.24459321796894073, "rewards/rejected": -0.28946632146835327, "step": 182 }, { "epoch": 1.38, "learning_rate": 2.991573033707865e-07, "logps/chosen": -26.668237686157227, "logps/rejected": -30.511489868164062, "loss": 0.6099, "losses/dpo": 0.6743872761726379, "losses/sft": 0.836949348449707, "losses/total": 0.6743872761726379, "ref_logps/chosen": -25.680599212646484, "ref_logps/rejected": -27.35342788696289, "rewards/accuracies": 0.7265625, "rewards/chosen": -0.09876400232315063, "rewards/margins": 0.2170422226190567, "rewards/rejected": -0.31580623984336853, "step": 183 }, { "epoch": 1.39, "learning_rate": 2.9775280898876406e-07, "logps/chosen": -23.974590301513672, "logps/rejected": -28.162975311279297, "loss": 0.6119, "losses/dpo": 0.5823447704315186, "losses/sft": 0.8065779805183411, "losses/total": 0.5823447704315186, "ref_logps/chosen": -23.180667877197266, "ref_logps/rejected": -25.296037673950195, "rewards/accuracies": 0.7109375, "rewards/chosen": -0.07939236611127853, "rewards/margins": 0.20730134844779968, "rewards/rejected": -0.2866936922073364, "step": 184 }, { "epoch": 1.4, "learning_rate": 2.9634831460674156e-07, "logps/chosen": -24.01116943359375, "logps/rejected": -30.05943489074707, "loss": 0.6203, "losses/dpo": 0.5889841318130493, "losses/sft": 0.8877280354499817, "losses/total": 0.5889841318130493, "ref_logps/chosen": -22.781108856201172, "ref_logps/rejected": -27.01274871826172, "rewards/accuracies": 0.734375, "rewards/chosen": -0.12300599366426468, "rewards/margins": 0.18166252970695496, "rewards/rejected": -0.3046685457229614, "step": 185 }, { "epoch": 1.4, "learning_rate": 2.9494382022471906e-07, "logps/chosen": -22.79621124267578, "logps/rejected": -28.1258544921875, "loss": 0.6198, "losses/dpo": 0.6025291681289673, "losses/sft": 0.93308424949646, "losses/total": 0.6025291681289673, "ref_logps/chosen": -21.591278076171875, "ref_logps/rejected": -25.04897689819336, "rewards/accuracies": 0.7421875, "rewards/chosen": -0.12049318104982376, "rewards/margins": 0.18719442188739777, "rewards/rejected": -0.3076876401901245, "step": 186 }, { "epoch": 1.41, "learning_rate": 2.935393258426966e-07, "logps/chosen": -24.246837615966797, "logps/rejected": -30.58446502685547, "loss": 0.6277, "losses/dpo": 0.5978178977966309, "losses/sft": 0.7778979539871216, "losses/total": 0.5978178977966309, "ref_logps/chosen": -23.127248764038086, "ref_logps/rejected": -27.7061767578125, "rewards/accuracies": 0.6875, "rewards/chosen": -0.11195877939462662, "rewards/margins": 0.17587023973464966, "rewards/rejected": -0.2878290116786957, "step": 187 }, { "epoch": 1.42, "learning_rate": 2.921348314606741e-07, "logps/chosen": -24.55533218383789, "logps/rejected": -29.098743438720703, "loss": 0.6458, "losses/dpo": 0.6147331595420837, "losses/sft": 0.8299495577812195, "losses/total": 0.6147331595420837, "ref_logps/chosen": -23.001358032226562, "ref_logps/rejected": -26.009681701660156, "rewards/accuracies": 0.59375, "rewards/chosen": -0.1553977131843567, "rewards/margins": 0.15350814163684845, "rewards/rejected": -0.30890583992004395, "step": 188 }, { "epoch": 1.43, "learning_rate": 2.907303370786517e-07, "logps/chosen": -22.7973690032959, "logps/rejected": -30.61502456665039, "loss": 0.5968, "losses/dpo": 0.5409806370735168, "losses/sft": 0.8110998272895813, "losses/total": 0.5409806370735168, "ref_logps/chosen": -21.900728225708008, "ref_logps/rejected": -27.346271514892578, "rewards/accuracies": 0.7578125, "rewards/chosen": -0.08966411650180817, "rewards/margins": 0.23721098899841309, "rewards/rejected": -0.32687509059906006, "step": 189 }, { "epoch": 1.43, "learning_rate": 2.893258426966292e-07, "logps/chosen": -21.656837463378906, "logps/rejected": -28.09313201904297, "loss": 0.636, "losses/dpo": 0.6395488977432251, "losses/sft": 0.8838689923286438, "losses/total": 0.6395488977432251, "ref_logps/chosen": -20.17813491821289, "ref_logps/rejected": -25.027902603149414, "rewards/accuracies": 0.6484375, "rewards/chosen": -0.14787010848522186, "rewards/margins": 0.15865309536457062, "rewards/rejected": -0.3065232038497925, "step": 190 }, { "epoch": 1.44, "learning_rate": 2.8792134831460674e-07, "logps/chosen": -23.13861083984375, "logps/rejected": -32.06410217285156, "loss": 0.6131, "losses/dpo": 0.6822565197944641, "losses/sft": 0.7876338362693787, "losses/total": 0.6822565197944641, "ref_logps/chosen": -21.99342918395996, "ref_logps/rejected": -28.761310577392578, "rewards/accuracies": 0.71875, "rewards/chosen": -0.11451825499534607, "rewards/margins": 0.21576061844825745, "rewards/rejected": -0.33027884364128113, "step": 191 }, { "epoch": 1.45, "learning_rate": 2.8651685393258425e-07, "logps/chosen": -22.36726951599121, "logps/rejected": -27.791099548339844, "loss": 0.6132, "losses/dpo": 0.5694007873535156, "losses/sft": 0.7940797805786133, "losses/total": 0.5694007873535156, "ref_logps/chosen": -21.662071228027344, "ref_logps/rejected": -25.100269317626953, "rewards/accuracies": 0.7265625, "rewards/chosen": -0.07051999121904373, "rewards/margins": 0.19856315851211548, "rewards/rejected": -0.2690831422805786, "step": 192 }, { "epoch": 1.46, "learning_rate": 2.851123595505618e-07, "logps/chosen": -24.541927337646484, "logps/rejected": -30.479598999023438, "loss": 0.6251, "losses/dpo": 0.6676912307739258, "losses/sft": 0.8101266026496887, "losses/total": 0.6676912307739258, "ref_logps/chosen": -23.375761032104492, "ref_logps/rejected": -27.565099716186523, "rewards/accuracies": 0.7109375, "rewards/chosen": -0.11661653220653534, "rewards/margins": 0.1748332977294922, "rewards/rejected": -0.2914498448371887, "step": 193 }, { "epoch": 1.46, "learning_rate": 2.8370786516853936e-07, "logps/chosen": -23.954505920410156, "logps/rejected": -30.262849807739258, "loss": 0.6289, "losses/dpo": 0.6359354257583618, "losses/sft": 0.846460223197937, "losses/total": 0.6359354257583618, "ref_logps/chosen": -22.69145965576172, "ref_logps/rejected": -27.221202850341797, "rewards/accuracies": 0.671875, "rewards/chosen": -0.12630482017993927, "rewards/margins": 0.1778600960969925, "rewards/rejected": -0.30416491627693176, "step": 194 }, { "epoch": 1.47, "learning_rate": 2.823033707865168e-07, "logps/chosen": -25.615474700927734, "logps/rejected": -32.26765823364258, "loss": 0.6017, "losses/dpo": 0.6264960765838623, "losses/sft": 0.906339704990387, "losses/total": 0.6264960765838623, "ref_logps/chosen": -24.62253189086914, "ref_logps/rejected": -29.02202796936035, "rewards/accuracies": 0.7421875, "rewards/chosen": -0.09929438680410385, "rewards/margins": 0.22526855766773224, "rewards/rejected": -0.3245629370212555, "step": 195 }, { "epoch": 1.48, "learning_rate": 2.8089887640449437e-07, "logps/chosen": -22.84251594543457, "logps/rejected": -28.347021102905273, "loss": 0.6191, "losses/dpo": 0.6483104825019836, "losses/sft": 0.9074235558509827, "losses/total": 0.6483104825019836, "ref_logps/chosen": -21.419048309326172, "ref_logps/rejected": -25.032745361328125, "rewards/accuracies": 0.703125, "rewards/chosen": -0.14234672486782074, "rewards/margins": 0.18908075988292694, "rewards/rejected": -0.3314274847507477, "step": 196 }, { "epoch": 1.49, "learning_rate": 2.794943820224719e-07, "logps/chosen": -23.73548126220703, "logps/rejected": -28.329975128173828, "loss": 0.6238, "losses/dpo": 0.6014984250068665, "losses/sft": 0.773016631603241, "losses/total": 0.6014984250068665, "ref_logps/chosen": -22.425506591796875, "ref_logps/rejected": -25.099872589111328, "rewards/accuracies": 0.6796875, "rewards/chosen": -0.13099724054336548, "rewards/margins": 0.19201286137104034, "rewards/rejected": -0.323010116815567, "step": 197 }, { "epoch": 1.49, "learning_rate": 2.7808988764044943e-07, "logps/chosen": -26.183156967163086, "logps/rejected": -30.921403884887695, "loss": 0.625, "losses/dpo": 0.6309884190559387, "losses/sft": 0.8918415307998657, "losses/total": 0.6309884190559387, "ref_logps/chosen": -24.701202392578125, "ref_logps/rejected": -27.679357528686523, "rewards/accuracies": 0.6953125, "rewards/chosen": -0.14819550514221191, "rewards/margins": 0.17600935697555542, "rewards/rejected": -0.32420486211776733, "step": 198 }, { "epoch": 1.5, "learning_rate": 2.7668539325842694e-07, "logps/chosen": -23.88658905029297, "logps/rejected": -29.73432731628418, "loss": 0.6156, "losses/dpo": 0.6188192367553711, "losses/sft": 0.8410817384719849, "losses/total": 0.6188192367553711, "ref_logps/chosen": -22.504894256591797, "ref_logps/rejected": -26.384294509887695, "rewards/accuracies": 0.6328125, "rewards/chosen": -0.13816949725151062, "rewards/margins": 0.19683387875556946, "rewards/rejected": -0.3350033462047577, "step": 199 }, { "epoch": 1.51, "learning_rate": 2.752808988764045e-07, "logps/chosen": -23.145811080932617, "logps/rejected": -29.627685546875, "loss": 0.637, "losses/dpo": 0.6995939612388611, "losses/sft": 0.9283435344696045, "losses/total": 0.6995939612388611, "ref_logps/chosen": -21.45529556274414, "ref_logps/rejected": -26.336702346801758, "rewards/accuracies": 0.625, "rewards/chosen": -0.1690514236688614, "rewards/margins": 0.16004663705825806, "rewards/rejected": -0.32909804582595825, "step": 200 }, { "epoch": 1.52, "learning_rate": 2.73876404494382e-07, "logps/chosen": -22.545406341552734, "logps/rejected": -30.04849624633789, "loss": 0.608, "losses/dpo": 0.6513813734054565, "losses/sft": 0.9403305649757385, "losses/total": 0.6513813734054565, "ref_logps/chosen": -21.010854721069336, "ref_logps/rejected": -26.149032592773438, "rewards/accuracies": 0.671875, "rewards/chosen": -0.15345513820648193, "rewards/margins": 0.23649117350578308, "rewards/rejected": -0.3899462819099426, "step": 201 }, { "epoch": 1.52, "learning_rate": 2.7247191011235955e-07, "logps/chosen": -22.640438079833984, "logps/rejected": -28.583681106567383, "loss": 0.6007, "losses/dpo": 0.5443820357322693, "losses/sft": 0.8517413139343262, "losses/total": 0.5443820357322693, "ref_logps/chosen": -21.29751968383789, "ref_logps/rejected": -24.850605010986328, "rewards/accuracies": 0.765625, "rewards/chosen": -0.1342916190624237, "rewards/margins": 0.23901620507240295, "rewards/rejected": -0.37330782413482666, "step": 202 }, { "epoch": 1.53, "learning_rate": 2.710674157303371e-07, "logps/chosen": -25.259624481201172, "logps/rejected": -32.96052551269531, "loss": 0.6029, "losses/dpo": 0.5749891996383667, "losses/sft": 0.9417051672935486, "losses/total": 0.5749891996383667, "ref_logps/chosen": -23.468887329101562, "ref_logps/rejected": -28.776565551757812, "rewards/accuracies": 0.7421875, "rewards/chosen": -0.17907381057739258, "rewards/margins": 0.23932181298732758, "rewards/rejected": -0.41839560866355896, "step": 203 }, { "epoch": 1.54, "learning_rate": 2.6966292134831456e-07, "logps/chosen": -24.431142807006836, "logps/rejected": -31.409852981567383, "loss": 0.6256, "losses/dpo": 0.6045551896095276, "losses/sft": 0.8162484169006348, "losses/total": 0.6045551896095276, "ref_logps/chosen": -22.7187442779541, "ref_logps/rejected": -27.74604606628418, "rewards/accuracies": 0.703125, "rewards/chosen": -0.1712397187948227, "rewards/margins": 0.19514092803001404, "rewards/rejected": -0.36638063192367554, "step": 204 }, { "epoch": 1.55, "learning_rate": 2.682584269662921e-07, "logps/chosen": -22.776988983154297, "logps/rejected": -30.418426513671875, "loss": 0.6093, "losses/dpo": 0.630817711353302, "losses/sft": 0.907343327999115, "losses/total": 0.630817711353302, "ref_logps/chosen": -21.150266647338867, "ref_logps/rejected": -26.52399444580078, "rewards/accuracies": 0.734375, "rewards/chosen": -0.16267219185829163, "rewards/margins": 0.22677099704742432, "rewards/rejected": -0.38944315910339355, "step": 205 }, { "epoch": 1.55, "learning_rate": 2.668539325842696e-07, "logps/chosen": -24.300395965576172, "logps/rejected": -32.63694763183594, "loss": 0.5834, "losses/dpo": 0.5977815389633179, "losses/sft": 0.8870611190795898, "losses/total": 0.5977815389633179, "ref_logps/chosen": -22.923202514648438, "ref_logps/rejected": -28.30066680908203, "rewards/accuracies": 0.71875, "rewards/chosen": -0.13771943747997284, "rewards/margins": 0.29590874910354614, "rewards/rejected": -0.4336281716823578, "step": 206 }, { "epoch": 1.56, "learning_rate": 2.654494382022472e-07, "logps/chosen": -25.562063217163086, "logps/rejected": -28.686279296875, "loss": 0.6248, "losses/dpo": 0.593975841999054, "losses/sft": 0.8298511505126953, "losses/total": 0.593975841999054, "ref_logps/chosen": -23.863605499267578, "ref_logps/rejected": -24.980735778808594, "rewards/accuracies": 0.6171875, "rewards/chosen": -0.16984564065933228, "rewards/margins": 0.2007087767124176, "rewards/rejected": -0.3705544173717499, "step": 207 }, { "epoch": 1.57, "learning_rate": 2.640449438202247e-07, "logps/chosen": -24.133087158203125, "logps/rejected": -32.86896514892578, "loss": 0.6072, "losses/dpo": 0.5785881280899048, "losses/sft": 0.9283973574638367, "losses/total": 0.5785881280899048, "ref_logps/chosen": -22.546520233154297, "ref_logps/rejected": -28.988473892211914, "rewards/accuracies": 0.671875, "rewards/chosen": -0.158656507730484, "rewards/margins": 0.2293928861618042, "rewards/rejected": -0.3880493640899658, "step": 208 }, { "epoch": 1.58, "learning_rate": 2.6264044943820224e-07, "logps/chosen": -21.610166549682617, "logps/rejected": -33.77753448486328, "loss": 0.5743, "losses/dpo": 0.5111271142959595, "losses/sft": 0.7807843685150146, "losses/total": 0.5111271142959595, "ref_logps/chosen": -20.26101303100586, "ref_logps/rejected": -29.390432357788086, "rewards/accuracies": 0.7578125, "rewards/chosen": -0.1349155306816101, "rewards/margins": 0.3037945628166199, "rewards/rejected": -0.43871009349823, "step": 209 }, { "epoch": 1.58, "learning_rate": 2.612359550561798e-07, "logps/chosen": -24.600027084350586, "logps/rejected": -28.993408203125, "loss": 0.621, "losses/dpo": 0.6254321336746216, "losses/sft": 0.7647839188575745, "losses/total": 0.6254321336746216, "ref_logps/chosen": -22.838638305664062, "ref_logps/rejected": -25.172962188720703, "rewards/accuracies": 0.6796875, "rewards/chosen": -0.1761387437582016, "rewards/margins": 0.20590564608573914, "rewards/rejected": -0.38204440474510193, "step": 210 }, { "epoch": 1.59, "learning_rate": 2.598314606741573e-07, "logps/chosen": -25.24309730529785, "logps/rejected": -32.02477264404297, "loss": 0.6078, "losses/dpo": 0.6571998000144958, "losses/sft": 0.8880329728126526, "losses/total": 0.6571998000144958, "ref_logps/chosen": -23.272363662719727, "ref_logps/rejected": -27.582080841064453, "rewards/accuracies": 0.71875, "rewards/chosen": -0.19707328081130981, "rewards/margins": 0.24719560146331787, "rewards/rejected": -0.4442688822746277, "step": 211 }, { "epoch": 1.6, "learning_rate": 2.5842696629213486e-07, "logps/chosen": -23.570541381835938, "logps/rejected": -31.662994384765625, "loss": 0.5954, "losses/dpo": 0.6153095960617065, "losses/sft": 0.7867841720581055, "losses/total": 0.6153095960617065, "ref_logps/chosen": -21.58125114440918, "ref_logps/rejected": -27.1029052734375, "rewards/accuracies": 0.78125, "rewards/chosen": -0.19892916083335876, "rewards/margins": 0.2570798993110657, "rewards/rejected": -0.45600906014442444, "step": 212 }, { "epoch": 1.61, "learning_rate": 2.5702247191011236e-07, "logps/chosen": -26.515090942382812, "logps/rejected": -33.26690673828125, "loss": 0.5944, "losses/dpo": 0.559239387512207, "losses/sft": 0.8030417561531067, "losses/total": 0.559239387512207, "ref_logps/chosen": -24.70389175415039, "ref_logps/rejected": -28.788631439208984, "rewards/accuracies": 0.7421875, "rewards/chosen": -0.18111974000930786, "rewards/margins": 0.2667076587677002, "rewards/rejected": -0.44782739877700806, "step": 213 }, { "epoch": 1.62, "learning_rate": 2.5561797752808987e-07, "logps/chosen": -23.109725952148438, "logps/rejected": -30.950822830200195, "loss": 0.6028, "losses/dpo": 0.6463332772254944, "losses/sft": 0.867030918598175, "losses/total": 0.6463332772254944, "ref_logps/chosen": -21.529489517211914, "ref_logps/rejected": -26.95291519165039, "rewards/accuracies": 0.7421875, "rewards/chosen": -0.15802377462387085, "rewards/margins": 0.24176692962646484, "rewards/rejected": -0.3997907340526581, "step": 214 }, { "epoch": 1.62, "learning_rate": 2.5421348314606737e-07, "logps/chosen": -22.45772933959961, "logps/rejected": -30.6645450592041, "loss": 0.548, "losses/dpo": 0.49787038564682007, "losses/sft": 0.9076435565948486, "losses/total": 0.49787038564682007, "ref_logps/chosen": -21.689294815063477, "ref_logps/rejected": -26.167482376098633, "rewards/accuracies": 0.796875, "rewards/chosen": -0.07684363424777985, "rewards/margins": 0.37286245822906494, "rewards/rejected": -0.449706107378006, "step": 215 }, { "epoch": 1.63, "learning_rate": 2.5280898876404493e-07, "logps/chosen": -23.930644989013672, "logps/rejected": -31.34885597229004, "loss": 0.5791, "losses/dpo": 0.6228358745574951, "losses/sft": 0.894844651222229, "losses/total": 0.6228358745574951, "ref_logps/chosen": -22.003002166748047, "ref_logps/rejected": -26.41282844543457, "rewards/accuracies": 0.7265625, "rewards/chosen": -0.19276437163352966, "rewards/margins": 0.3008383512496948, "rewards/rejected": -0.4936027228832245, "step": 216 }, { "epoch": 1.64, "learning_rate": 2.5140449438202243e-07, "logps/chosen": -25.59225082397461, "logps/rejected": -30.82415199279785, "loss": 0.5571, "losses/dpo": 0.5233840942382812, "losses/sft": 0.8860921263694763, "losses/total": 0.5233840942382812, "ref_logps/chosen": -23.89864730834961, "ref_logps/rejected": -25.53179168701172, "rewards/accuracies": 0.765625, "rewards/chosen": -0.16936028003692627, "rewards/margins": 0.35987579822540283, "rewards/rejected": -0.5292361378669739, "step": 217 }, { "epoch": 1.65, "learning_rate": 2.5e-07, "logps/chosen": -26.896615982055664, "logps/rejected": -32.64814376831055, "loss": 0.5821, "losses/dpo": 0.5345016121864319, "losses/sft": 0.9819333553314209, "losses/total": 0.5345016121864319, "ref_logps/chosen": -24.95808982849121, "ref_logps/rejected": -27.606571197509766, "rewards/accuracies": 0.703125, "rewards/chosen": -0.19385257363319397, "rewards/margins": 0.3103046417236328, "rewards/rejected": -0.5041571855545044, "step": 218 }, { "epoch": 1.65, "learning_rate": 2.485955056179775e-07, "logps/chosen": -21.461519241333008, "logps/rejected": -29.887657165527344, "loss": 0.5621, "losses/dpo": 0.5603345632553101, "losses/sft": 0.7855640649795532, "losses/total": 0.5603345632553101, "ref_logps/chosen": -20.122406005859375, "ref_logps/rejected": -24.85255241394043, "rewards/accuracies": 0.75, "rewards/chosen": -0.13391147553920746, "rewards/margins": 0.3695991039276123, "rewards/rejected": -0.503510594367981, "step": 219 }, { "epoch": 1.66, "learning_rate": 2.4719101123595505e-07, "logps/chosen": -22.143098831176758, "logps/rejected": -34.4566764831543, "loss": 0.5923, "losses/dpo": 0.5465586185455322, "losses/sft": 1.051912546157837, "losses/total": 0.5465586185455322, "ref_logps/chosen": -20.453866958618164, "ref_logps/rejected": -30.012981414794922, "rewards/accuracies": 0.6796875, "rewards/chosen": -0.16892319917678833, "rewards/margins": 0.27544665336608887, "rewards/rejected": -0.4443698525428772, "step": 220 }, { "epoch": 1.67, "learning_rate": 2.4578651685393255e-07, "logps/chosen": -24.042566299438477, "logps/rejected": -29.772445678710938, "loss": 0.6149, "losses/dpo": 0.6469910144805908, "losses/sft": 1.0151987075805664, "losses/total": 0.6469910144805908, "ref_logps/chosen": -21.836162567138672, "ref_logps/rejected": -25.394845962524414, "rewards/accuracies": 0.6328125, "rewards/chosen": -0.220640629529953, "rewards/margins": 0.2171194702386856, "rewards/rejected": -0.4377601146697998, "step": 221 }, { "epoch": 1.68, "learning_rate": 2.443820224719101e-07, "logps/chosen": -24.834793090820312, "logps/rejected": -33.834083557128906, "loss": 0.5676, "losses/dpo": 0.6051491498947144, "losses/sft": 0.8380707502365112, "losses/total": 0.6051491498947144, "ref_logps/chosen": -22.940967559814453, "ref_logps/rejected": -28.49428939819336, "rewards/accuracies": 0.75, "rewards/chosen": -0.18938273191452026, "rewards/margins": 0.3445969223976135, "rewards/rejected": -0.5339796543121338, "step": 222 }, { "epoch": 1.68, "learning_rate": 2.429775280898876e-07, "logps/chosen": -25.5327091217041, "logps/rejected": -30.429113388061523, "loss": 0.6089, "losses/dpo": 0.5853685140609741, "losses/sft": 0.6926910877227783, "losses/total": 0.5853685140609741, "ref_logps/chosen": -23.138214111328125, "ref_logps/rejected": -25.674575805664062, "rewards/accuracies": 0.7109375, "rewards/chosen": -0.23944953083992004, "rewards/margins": 0.2360040545463562, "rewards/rejected": -0.47545361518859863, "step": 223 }, { "epoch": 1.69, "learning_rate": 2.4157303370786517e-07, "logps/chosen": -24.123153686523438, "logps/rejected": -29.51090431213379, "loss": 0.6134, "losses/dpo": 0.7566800117492676, "losses/sft": 0.9139145612716675, "losses/total": 0.7566800117492676, "ref_logps/chosen": -22.065155029296875, "ref_logps/rejected": -24.94894027709961, "rewards/accuracies": 0.7109375, "rewards/chosen": -0.20579975843429565, "rewards/margins": 0.250396728515625, "rewards/rejected": -0.45619648694992065, "step": 224 }, { "epoch": 1.7, "learning_rate": 2.401685393258427e-07, "logps/chosen": -26.274799346923828, "logps/rejected": -32.90815734863281, "loss": 0.6145, "losses/dpo": 0.6078730225563049, "losses/sft": 1.1017650365829468, "losses/total": 0.6078730225563049, "ref_logps/chosen": -23.55907440185547, "ref_logps/rejected": -27.880718231201172, "rewards/accuracies": 0.65625, "rewards/chosen": -0.2715725004673004, "rewards/margins": 0.23117120563983917, "rewards/rejected": -0.5027437210083008, "step": 225 }, { "epoch": 1.71, "learning_rate": 2.3876404494382023e-07, "logps/chosen": -25.727689743041992, "logps/rejected": -30.410335540771484, "loss": 0.6292, "losses/dpo": 0.6031284332275391, "losses/sft": 0.7834776639938354, "losses/total": 0.6031284332275391, "ref_logps/chosen": -23.430198669433594, "ref_logps/rejected": -26.02400779724121, "rewards/accuracies": 0.6796875, "rewards/chosen": -0.22974896430969238, "rewards/margins": 0.20888389647006989, "rewards/rejected": -0.4386328458786011, "step": 226 }, { "epoch": 1.71, "learning_rate": 2.3735955056179774e-07, "logps/chosen": -25.917598724365234, "logps/rejected": -31.14261245727539, "loss": 0.5928, "losses/dpo": 0.5714601874351501, "losses/sft": 0.8888335227966309, "losses/total": 0.5714601874351501, "ref_logps/chosen": -23.436574935913086, "ref_logps/rejected": -25.756431579589844, "rewards/accuracies": 0.7265625, "rewards/chosen": -0.24810227751731873, "rewards/margins": 0.29051584005355835, "rewards/rejected": -0.5386180877685547, "step": 227 }, { "epoch": 1.72, "learning_rate": 2.3595505617977527e-07, "logps/chosen": -25.50743865966797, "logps/rejected": -34.945220947265625, "loss": 0.5505, "losses/dpo": 0.5715539455413818, "losses/sft": 0.8663308620452881, "losses/total": 0.5715539455413818, "ref_logps/chosen": -23.417984008789062, "ref_logps/rejected": -28.690208435058594, "rewards/accuracies": 0.8203125, "rewards/chosen": -0.20894566178321838, "rewards/margins": 0.41655558347702026, "rewards/rejected": -0.625501275062561, "step": 228 }, { "epoch": 1.73, "learning_rate": 2.345505617977528e-07, "logps/chosen": -23.620698928833008, "logps/rejected": -34.89327621459961, "loss": 0.571, "losses/dpo": 0.6053493022918701, "losses/sft": 0.8246825933456421, "losses/total": 0.6053493022918701, "ref_logps/chosen": -21.27004623413086, "ref_logps/rejected": -29.035568237304688, "rewards/accuracies": 0.7109375, "rewards/chosen": -0.23506540060043335, "rewards/margins": 0.35070547461509705, "rewards/rejected": -0.585770845413208, "step": 229 }, { "epoch": 1.74, "learning_rate": 2.331460674157303e-07, "logps/chosen": -21.874225616455078, "logps/rejected": -34.58841323852539, "loss": 0.5745, "losses/dpo": 0.5964910984039307, "losses/sft": 0.842921793460846, "losses/total": 0.5964910984039307, "ref_logps/chosen": -19.500164031982422, "ref_logps/rejected": -28.771209716796875, "rewards/accuracies": 0.6796875, "rewards/chosen": -0.23740598559379578, "rewards/margins": 0.34431448578834534, "rewards/rejected": -0.5817204713821411, "step": 230 }, { "epoch": 1.74, "learning_rate": 2.3174157303370786e-07, "logps/chosen": -24.84224510192871, "logps/rejected": -32.233497619628906, "loss": 0.6064, "losses/dpo": 0.5861349105834961, "losses/sft": 0.9263943433761597, "losses/total": 0.5861349105834961, "ref_logps/chosen": -21.88359832763672, "ref_logps/rejected": -26.701745986938477, "rewards/accuracies": 0.6796875, "rewards/chosen": -0.29586488008499146, "rewards/margins": 0.2573099732398987, "rewards/rejected": -0.5531748533248901, "step": 231 }, { "epoch": 1.75, "learning_rate": 2.303370786516854e-07, "logps/chosen": -25.4254207611084, "logps/rejected": -34.96025085449219, "loss": 0.5747, "losses/dpo": 0.5563768744468689, "losses/sft": 0.9355225563049316, "losses/total": 0.5563768744468689, "ref_logps/chosen": -22.772850036621094, "ref_logps/rejected": -28.902484893798828, "rewards/accuracies": 0.734375, "rewards/chosen": -0.26525697112083435, "rewards/margins": 0.3405200242996216, "rewards/rejected": -0.6057769656181335, "step": 232 }, { "epoch": 1.76, "learning_rate": 2.2893258426966292e-07, "logps/chosen": -26.856834411621094, "logps/rejected": -34.212364196777344, "loss": 0.6228, "losses/dpo": 0.6681157946586609, "losses/sft": 1.0442770719528198, "losses/total": 0.6681157946586609, "ref_logps/chosen": -23.627426147460938, "ref_logps/rejected": -28.62677001953125, "rewards/accuracies": 0.640625, "rewards/chosen": -0.32294073700904846, "rewards/margins": 0.23561875522136688, "rewards/rejected": -0.5585595369338989, "step": 233 }, { "epoch": 1.77, "learning_rate": 2.2752808988764045e-07, "logps/chosen": -26.366958618164062, "logps/rejected": -33.41276550292969, "loss": 0.6217, "losses/dpo": 0.6866650581359863, "losses/sft": 0.8693393468856812, "losses/total": 0.6866650581359863, "ref_logps/chosen": -23.189382553100586, "ref_logps/rejected": -27.676807403564453, "rewards/accuracies": 0.703125, "rewards/chosen": -0.31775763630867004, "rewards/margins": 0.25583818554878235, "rewards/rejected": -0.5735958218574524, "step": 234 }, { "epoch": 1.77, "learning_rate": 2.2612359550561795e-07, "logps/chosen": -24.26227569580078, "logps/rejected": -32.4229736328125, "loss": 0.604, "losses/dpo": 0.5642524361610413, "losses/sft": 0.9980260133743286, "losses/total": 0.5642524361610413, "ref_logps/chosen": -21.425315856933594, "ref_logps/rejected": -26.820331573486328, "rewards/accuracies": 0.609375, "rewards/chosen": -0.28369593620300293, "rewards/margins": 0.2765684127807617, "rewards/rejected": -0.5602643489837646, "step": 235 }, { "epoch": 1.78, "learning_rate": 2.2471910112359549e-07, "logps/chosen": -27.912431716918945, "logps/rejected": -31.85492706298828, "loss": 0.6448, "losses/dpo": 0.5940742492675781, "losses/sft": 0.969171404838562, "losses/total": 0.5940742492675781, "ref_logps/chosen": -24.59956932067871, "ref_logps/rejected": -26.790037155151367, "rewards/accuracies": 0.65625, "rewards/chosen": -0.3312861919403076, "rewards/margins": 0.17520278692245483, "rewards/rejected": -0.5064890384674072, "step": 236 }, { "epoch": 1.79, "learning_rate": 2.2331460674157302e-07, "logps/chosen": -27.303508758544922, "logps/rejected": -37.65882110595703, "loss": 0.5545, "losses/dpo": 0.5936781764030457, "losses/sft": 1.015429139137268, "losses/total": 0.5936781764030457, "ref_logps/chosen": -24.510639190673828, "ref_logps/rejected": -30.55707550048828, "rewards/accuracies": 0.796875, "rewards/chosen": -0.27928683161735535, "rewards/margins": 0.4308881163597107, "rewards/rejected": -0.7101750373840332, "step": 237 }, { "epoch": 1.8, "learning_rate": 2.2191011235955055e-07, "logps/chosen": -24.99541473388672, "logps/rejected": -30.256423950195312, "loss": 0.6034, "losses/dpo": 0.608791172504425, "losses/sft": 0.9114975929260254, "losses/total": 0.608791172504425, "ref_logps/chosen": -22.079914093017578, "ref_logps/rejected": -24.779722213745117, "rewards/accuracies": 0.6953125, "rewards/chosen": -0.2915502190589905, "rewards/margins": 0.2561199963092804, "rewards/rejected": -0.5476702451705933, "step": 238 }, { "epoch": 1.8, "learning_rate": 2.205056179775281e-07, "logps/chosen": -27.542556762695312, "logps/rejected": -34.17859649658203, "loss": 0.574, "losses/dpo": 0.5037014484405518, "losses/sft": 0.8922078609466553, "losses/total": 0.5037014484405518, "ref_logps/chosen": -24.774127960205078, "ref_logps/rejected": -27.759735107421875, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2768429219722748, "rewards/margins": 0.3650434911251068, "rewards/rejected": -0.6418864727020264, "step": 239 }, { "epoch": 1.81, "learning_rate": 2.191011235955056e-07, "logps/chosen": -25.87149429321289, "logps/rejected": -34.46807861328125, "loss": 0.6117, "losses/dpo": 0.7050824165344238, "losses/sft": 0.9497538208961487, "losses/total": 0.7050824165344238, "ref_logps/chosen": -23.14657211303711, "ref_logps/rejected": -28.885162353515625, "rewards/accuracies": 0.6875, "rewards/chosen": -0.27249252796173096, "rewards/margins": 0.2857990562915802, "rewards/rejected": -0.5582915544509888, "step": 240 }, { "epoch": 1.82, "learning_rate": 2.1769662921348314e-07, "logps/chosen": -24.89635467529297, "logps/rejected": -33.622718811035156, "loss": 0.5808, "losses/dpo": 0.5883455276489258, "losses/sft": 0.9948925375938416, "losses/total": 0.5883455276489258, "ref_logps/chosen": -21.52194595336914, "ref_logps/rejected": -26.93505859375, "rewards/accuracies": 0.7265625, "rewards/chosen": -0.3374406695365906, "rewards/margins": 0.3313255310058594, "rewards/rejected": -0.66876620054245, "step": 241 }, { "epoch": 1.83, "learning_rate": 2.1629213483146067e-07, "logps/chosen": -24.833309173583984, "logps/rejected": -30.974327087402344, "loss": 0.6136, "losses/dpo": 0.6376237869262695, "losses/sft": 0.9374114274978638, "losses/total": 0.6376237869262695, "ref_logps/chosen": -21.7708683013916, "ref_logps/rejected": -25.24457359313965, "rewards/accuracies": 0.65625, "rewards/chosen": -0.30624428391456604, "rewards/margins": 0.2667309045791626, "rewards/rejected": -0.5729751586914062, "step": 242 }, { "epoch": 1.83, "learning_rate": 2.148876404494382e-07, "logps/chosen": -24.04471778869629, "logps/rejected": -34.610633850097656, "loss": 0.6133, "losses/dpo": 0.645912766456604, "losses/sft": 0.9913955926895142, "losses/total": 0.645912766456604, "ref_logps/chosen": -20.834651947021484, "ref_logps/rejected": -28.77642059326172, "rewards/accuracies": 0.6953125, "rewards/chosen": -0.32100653648376465, "rewards/margins": 0.26241475343704224, "rewards/rejected": -0.5834212899208069, "step": 243 }, { "epoch": 1.84, "learning_rate": 2.134831460674157e-07, "logps/chosen": -26.419416427612305, "logps/rejected": -34.56787109375, "loss": 0.5713, "losses/dpo": 0.6227866411209106, "losses/sft": 0.9809292554855347, "losses/total": 0.6227866411209106, "ref_logps/chosen": -23.278644561767578, "ref_logps/rejected": -27.596946716308594, "rewards/accuracies": 0.7265625, "rewards/chosen": -0.3140770494937897, "rewards/margins": 0.38301563262939453, "rewards/rejected": -0.6970926523208618, "step": 244 }, { "epoch": 1.85, "learning_rate": 2.1207865168539323e-07, "logps/chosen": -26.64739990234375, "logps/rejected": -33.21559524536133, "loss": 0.59, "losses/dpo": 0.6351089477539062, "losses/sft": 0.9912072420120239, "losses/total": 0.6351089477539062, "ref_logps/chosen": -23.281349182128906, "ref_logps/rejected": -26.577198028564453, "rewards/accuracies": 0.7109375, "rewards/chosen": -0.3366050124168396, "rewards/margins": 0.32723480463027954, "rewards/rejected": -0.6638398170471191, "step": 245 }, { "epoch": 1.86, "learning_rate": 2.1067415730337076e-07, "logps/chosen": -27.422582626342773, "logps/rejected": -35.08824920654297, "loss": 0.6064, "losses/dpo": 0.5233859419822693, "losses/sft": 0.8136109709739685, "losses/total": 0.5233859419822693, "ref_logps/chosen": -24.274629592895508, "ref_logps/rejected": -28.790220260620117, "rewards/accuracies": 0.71875, "rewards/chosen": -0.3147951364517212, "rewards/margins": 0.3150079846382141, "rewards/rejected": -0.6298030614852905, "step": 246 }, { "epoch": 1.86, "learning_rate": 2.0926966292134832e-07, "logps/chosen": -26.381507873535156, "logps/rejected": -31.576181411743164, "loss": 0.5829, "losses/dpo": 0.5970532894134521, "losses/sft": 0.8552703261375427, "losses/total": 0.5970532894134521, "ref_logps/chosen": -23.155136108398438, "ref_logps/rejected": -24.94633674621582, "rewards/accuracies": 0.65625, "rewards/chosen": -0.32263678312301636, "rewards/margins": 0.34034764766693115, "rewards/rejected": -0.6629844903945923, "step": 247 }, { "epoch": 1.87, "learning_rate": 2.0786516853932585e-07, "logps/chosen": -24.061811447143555, "logps/rejected": -29.508312225341797, "loss": 0.6137, "losses/dpo": 0.6248607039451599, "losses/sft": 0.8072177767753601, "losses/total": 0.6248607039451599, "ref_logps/chosen": -20.497760772705078, "ref_logps/rejected": -23.47817611694336, "rewards/accuracies": 0.625, "rewards/chosen": -0.35640496015548706, "rewards/margins": 0.24660846590995789, "rewards/rejected": -0.6030134558677673, "step": 248 }, { "epoch": 1.88, "learning_rate": 2.0646067415730336e-07, "logps/chosen": -29.165149688720703, "logps/rejected": -35.16246032714844, "loss": 0.5826, "losses/dpo": 0.5271694660186768, "losses/sft": 1.0120395421981812, "losses/total": 0.5271694660186768, "ref_logps/chosen": -25.856834411621094, "ref_logps/rejected": -28.48740005493164, "rewards/accuracies": 0.7421875, "rewards/chosen": -0.3308315873146057, "rewards/margins": 0.33667463064193726, "rewards/rejected": -0.667506217956543, "step": 249 }, { "epoch": 1.89, "learning_rate": 2.0505617977528089e-07, "logps/chosen": -26.1055965423584, "logps/rejected": -36.45195770263672, "loss": 0.5345, "losses/dpo": 0.5425952076911926, "losses/sft": 0.9156839847564697, "losses/total": 0.5425952076911926, "ref_logps/chosen": -23.200654983520508, "ref_logps/rejected": -28.661373138427734, "rewards/accuracies": 0.7578125, "rewards/chosen": -0.2904941737651825, "rewards/margins": 0.48856407403945923, "rewards/rejected": -0.7790582776069641, "step": 250 }, { "epoch": 1.89, "learning_rate": 2.0365168539325842e-07, "logps/chosen": -24.59746551513672, "logps/rejected": -36.00947570800781, "loss": 0.5622, "losses/dpo": 0.6595858335494995, "losses/sft": 0.8320033550262451, "losses/total": 0.6595858335494995, "ref_logps/chosen": -21.081745147705078, "ref_logps/rejected": -28.422481536865234, "rewards/accuracies": 0.703125, "rewards/chosen": -0.3515721559524536, "rewards/margins": 0.4071270823478699, "rewards/rejected": -0.7586992383003235, "step": 251 }, { "epoch": 1.9, "learning_rate": 2.0224719101123595e-07, "logps/chosen": -25.407838821411133, "logps/rejected": -33.07604217529297, "loss": 0.5892, "losses/dpo": 0.5324288606643677, "losses/sft": 1.0311552286148071, "losses/total": 0.5324288606643677, "ref_logps/chosen": -22.188087463378906, "ref_logps/rejected": -26.633270263671875, "rewards/accuracies": 0.65625, "rewards/chosen": -0.3219751715660095, "rewards/margins": 0.322301983833313, "rewards/rejected": -0.6442771553993225, "step": 252 }, { "epoch": 1.91, "learning_rate": 2.0084269662921348e-07, "logps/chosen": -26.190311431884766, "logps/rejected": -33.34137725830078, "loss": 0.5861, "losses/dpo": 0.6612842082977295, "losses/sft": 0.8551939129829407, "losses/total": 0.6612842082977295, "ref_logps/chosen": -22.73942756652832, "ref_logps/rejected": -26.702760696411133, "rewards/accuracies": 0.671875, "rewards/chosen": -0.345088392496109, "rewards/margins": 0.3187733292579651, "rewards/rejected": -0.6638616919517517, "step": 253 }, { "epoch": 1.92, "learning_rate": 1.9943820224719098e-07, "logps/chosen": -27.615928649902344, "logps/rejected": -33.776695251464844, "loss": 0.5511, "losses/dpo": 0.6082693338394165, "losses/sft": 1.0973209142684937, "losses/total": 0.6082693338394165, "ref_logps/chosen": -24.300251007080078, "ref_logps/rejected": -26.21588897705078, "rewards/accuracies": 0.7578125, "rewards/chosen": -0.3315678834915161, "rewards/margins": 0.42451295256614685, "rewards/rejected": -0.7560808658599854, "step": 254 }, { "epoch": 1.92, "learning_rate": 1.9803370786516854e-07, "logps/chosen": -28.257335662841797, "logps/rejected": -35.67947769165039, "loss": 0.5919, "losses/dpo": 0.6389520168304443, "losses/sft": 1.087360143661499, "losses/total": 0.6389520168304443, "ref_logps/chosen": -24.031015396118164, "ref_logps/rejected": -28.11650276184082, "rewards/accuracies": 0.6796875, "rewards/chosen": -0.4226321578025818, "rewards/margins": 0.33366525173187256, "rewards/rejected": -0.7562973499298096, "step": 255 }, { "epoch": 1.93, "learning_rate": 1.9662921348314607e-07, "logps/chosen": -27.326435089111328, "logps/rejected": -33.91853713989258, "loss": 0.5884, "losses/dpo": 0.5772832632064819, "losses/sft": 1.0057258605957031, "losses/total": 0.5772832632064819, "ref_logps/chosen": -23.13665008544922, "ref_logps/rejected": -26.448516845703125, "rewards/accuracies": 0.6796875, "rewards/chosen": -0.4189784526824951, "rewards/margins": 0.32802364230155945, "rewards/rejected": -0.747002124786377, "step": 256 }, { "epoch": 1.94, "learning_rate": 1.952247191011236e-07, "logps/chosen": -24.134462356567383, "logps/rejected": -34.923095703125, "loss": 0.5245, "losses/dpo": 0.5826983451843262, "losses/sft": 0.7670709490776062, "losses/total": 0.5826983451843262, "ref_logps/chosen": -21.35719108581543, "ref_logps/rejected": -27.226768493652344, "rewards/accuracies": 0.7578125, "rewards/chosen": -0.2777270972728729, "rewards/margins": 0.49190521240234375, "rewards/rejected": -0.7696323394775391, "step": 257 }, { "epoch": 1.95, "learning_rate": 1.938202247191011e-07, "logps/chosen": -24.894744873046875, "logps/rejected": -36.34782791137695, "loss": 0.5654, "losses/dpo": 0.5832593441009521, "losses/sft": 0.8260340094566345, "losses/total": 0.5832593441009521, "ref_logps/chosen": -21.5096435546875, "ref_logps/rejected": -28.905296325683594, "rewards/accuracies": 0.7734375, "rewards/chosen": -0.3385101854801178, "rewards/margins": 0.4057431221008301, "rewards/rejected": -0.7442533373832703, "step": 258 }, { "epoch": 1.95, "learning_rate": 1.9241573033707863e-07, "logps/chosen": -29.12051773071289, "logps/rejected": -33.72222900390625, "loss": 0.6189, "losses/dpo": 0.5586456060409546, "losses/sft": 1.1363164186477661, "losses/total": 0.5586456060409546, "ref_logps/chosen": -24.818958282470703, "ref_logps/rejected": -26.637435913085938, "rewards/accuracies": 0.6875, "rewards/chosen": -0.430155873298645, "rewards/margins": 0.27832359075546265, "rewards/rejected": -0.7084795236587524, "step": 259 }, { "epoch": 1.96, "learning_rate": 1.9101123595505617e-07, "logps/chosen": -25.77654266357422, "logps/rejected": -32.80144119262695, "loss": 0.5647, "losses/dpo": 0.6132915616035461, "losses/sft": 0.8355939984321594, "losses/total": 0.6132915616035461, "ref_logps/chosen": -22.049232482910156, "ref_logps/rejected": -25.218961715698242, "rewards/accuracies": 0.765625, "rewards/chosen": -0.37273097038269043, "rewards/margins": 0.3855169415473938, "rewards/rejected": -0.758247971534729, "step": 260 }, { "epoch": 1.97, "learning_rate": 1.896067415730337e-07, "logps/chosen": -27.173105239868164, "logps/rejected": -33.18789291381836, "loss": 0.5757, "losses/dpo": 0.6402326822280884, "losses/sft": 0.9358000159263611, "losses/total": 0.6402326822280884, "ref_logps/chosen": -24.05023956298828, "ref_logps/rejected": -26.557050704956055, "rewards/accuracies": 0.6796875, "rewards/chosen": -0.3122865557670593, "rewards/margins": 0.3507978618144989, "rewards/rejected": -0.6630844473838806, "step": 261 }, { "epoch": 1.98, "learning_rate": 1.8820224719101123e-07, "logps/chosen": -25.127092361450195, "logps/rejected": -34.0608024597168, "loss": 0.5844, "losses/dpo": 0.576771080493927, "losses/sft": 0.8823024034500122, "losses/total": 0.576771080493927, "ref_logps/chosen": -21.552627563476562, "ref_logps/rejected": -27.01715087890625, "rewards/accuracies": 0.671875, "rewards/chosen": -0.35744667053222656, "rewards/margins": 0.346918523311615, "rewards/rejected": -0.7043651938438416, "step": 262 }, { "epoch": 1.98, "learning_rate": 1.8679775280898876e-07, "logps/chosen": -25.840179443359375, "logps/rejected": -34.11262893676758, "loss": 0.5675, "losses/dpo": 0.5643225312232971, "losses/sft": 0.7924672365188599, "losses/total": 0.5643225312232971, "ref_logps/chosen": -22.366439819335938, "ref_logps/rejected": -26.873088836669922, "rewards/accuracies": 0.7109375, "rewards/chosen": -0.3473738133907318, "rewards/margins": 0.37658050656318665, "rewards/rejected": -0.7239543199539185, "step": 263 }, { "epoch": 1.99, "learning_rate": 1.853932584269663e-07, "logps/chosen": -24.64289093017578, "logps/rejected": -30.944011688232422, "loss": 0.5768, "losses/dpo": 0.6149911880493164, "losses/sft": 0.9512190222740173, "losses/total": 0.6149911880493164, "ref_logps/chosen": -21.270837783813477, "ref_logps/rejected": -24.173620223999023, "rewards/accuracies": 0.734375, "rewards/chosen": -0.33720535039901733, "rewards/margins": 0.3398338854312897, "rewards/rejected": -0.6770392656326294, "step": 264 }, { "epoch": 2.0, "learning_rate": 1.8398876404494382e-07, "logps/chosen": -26.05956268310547, "logps/rejected": -35.905609130859375, "loss": 0.5407, "losses/dpo": 0.49823397397994995, "losses/sft": 0.8145182132720947, "losses/total": 0.49823397397994995, "ref_logps/chosen": -23.054027557373047, "ref_logps/rejected": -28.27884864807129, "rewards/accuracies": 0.796875, "rewards/chosen": -0.3005535304546356, "rewards/margins": 0.4621226191520691, "rewards/rejected": -0.7626761794090271, "step": 265 }, { "epoch": 2.01, "learning_rate": 1.8258426966292135e-07, "logps/chosen": -24.907108306884766, "logps/rejected": -33.71357345581055, "loss": 0.5301, "losses/dpo": 0.49924543499946594, "losses/sft": 0.9444026350975037, "losses/total": 0.49924543499946594, "ref_logps/chosen": -21.562763214111328, "ref_logps/rejected": -25.69823455810547, "rewards/accuracies": 0.7578125, "rewards/chosen": -0.3344343304634094, "rewards/margins": 0.4670996069908142, "rewards/rejected": -0.8015338778495789, "step": 266 }, { "epoch": 2.02, "learning_rate": 1.8117977528089888e-07, "logps/chosen": -24.61281967163086, "logps/rejected": -33.178436279296875, "loss": 0.5843, "losses/dpo": 0.6827691793441772, "losses/sft": 0.9820384979248047, "losses/total": 0.6827691793441772, "ref_logps/chosen": -20.769065856933594, "ref_logps/rejected": -25.740190505981445, "rewards/accuracies": 0.6953125, "rewards/chosen": -0.3843753933906555, "rewards/margins": 0.35944926738739014, "rewards/rejected": -0.7438246607780457, "step": 267 }, { "epoch": 2.02, "learning_rate": 1.7977528089887638e-07, "logps/chosen": -25.742042541503906, "logps/rejected": -31.92254638671875, "loss": 0.5765, "losses/dpo": 0.48391562700271606, "losses/sft": 0.9694733619689941, "losses/total": 0.48391562700271606, "ref_logps/chosen": -22.27023696899414, "ref_logps/rejected": -24.869842529296875, "rewards/accuracies": 0.734375, "rewards/chosen": -0.34718072414398193, "rewards/margins": 0.3580899238586426, "rewards/rejected": -0.7052706480026245, "step": 268 }, { "epoch": 2.03, "learning_rate": 1.7837078651685391e-07, "logps/chosen": -24.76668930053711, "logps/rejected": -33.92596435546875, "loss": 0.5197, "losses/dpo": 0.566383957862854, "losses/sft": 1.056198239326477, "losses/total": 0.566383957862854, "ref_logps/chosen": -21.79462432861328, "ref_logps/rejected": -26.1394100189209, "rewards/accuracies": 0.796875, "rewards/chosen": -0.2972065806388855, "rewards/margins": 0.48144853115081787, "rewards/rejected": -0.7786551713943481, "step": 269 }, { "epoch": 2.04, "learning_rate": 1.7696629213483144e-07, "logps/chosen": -25.022621154785156, "logps/rejected": -32.83625030517578, "loss": 0.554, "losses/dpo": 0.5455434322357178, "losses/sft": 0.9091237783432007, "losses/total": 0.5455434322357178, "ref_logps/chosen": -21.205692291259766, "ref_logps/rejected": -24.853519439697266, "rewards/accuracies": 0.765625, "rewards/chosen": -0.3816927969455719, "rewards/margins": 0.4165803790092468, "rewards/rejected": -0.7982731461524963, "step": 270 }, { "epoch": 2.05, "learning_rate": 1.75561797752809e-07, "logps/chosen": -27.038639068603516, "logps/rejected": -35.007415771484375, "loss": 0.5526, "losses/dpo": 0.7876778841018677, "losses/sft": 1.1023296117782593, "losses/total": 0.7876778841018677, "ref_logps/chosen": -23.039878845214844, "ref_logps/rejected": -26.884708404541016, "rewards/accuracies": 0.78125, "rewards/chosen": -0.39987578988075256, "rewards/margins": 0.41239458322525024, "rewards/rejected": -0.8122704029083252, "step": 271 }, { "epoch": 2.05, "learning_rate": 1.741573033707865e-07, "logps/chosen": -26.300579071044922, "logps/rejected": -36.53676223754883, "loss": 0.5444, "losses/dpo": 0.4805631637573242, "losses/sft": 0.8787716031074524, "losses/total": 0.4805631637573242, "ref_logps/chosen": -22.55372428894043, "ref_logps/rejected": -27.711929321289062, "rewards/accuracies": 0.7578125, "rewards/chosen": -0.3746855556964874, "rewards/margins": 0.5077978372573853, "rewards/rejected": -0.8824833631515503, "step": 272 }, { "epoch": 2.06, "learning_rate": 1.7275280898876404e-07, "logps/chosen": -27.612911224365234, "logps/rejected": -39.48854064941406, "loss": 0.4883, "losses/dpo": 0.5499591827392578, "losses/sft": 1.1995720863342285, "losses/total": 0.5499591827392578, "ref_logps/chosen": -23.866586685180664, "ref_logps/rejected": -29.748516082763672, "rewards/accuracies": 0.828125, "rewards/chosen": -0.37463241815567017, "rewards/margins": 0.5993699431419373, "rewards/rejected": -0.9740023612976074, "step": 273 }, { "epoch": 2.07, "learning_rate": 1.7134831460674157e-07, "logps/chosen": -28.848485946655273, "logps/rejected": -38.46211242675781, "loss": 0.5223, "losses/dpo": 0.5853086113929749, "losses/sft": 0.9450937509536743, "losses/total": 0.5853086113929749, "ref_logps/chosen": -24.71368980407715, "ref_logps/rejected": -29.095096588134766, "rewards/accuracies": 0.796875, "rewards/chosen": -0.4134795069694519, "rewards/margins": 0.5232220888137817, "rewards/rejected": -0.9367015957832336, "step": 274 }, { "epoch": 2.08, "learning_rate": 1.699438202247191e-07, "logps/chosen": -26.53584861755371, "logps/rejected": -33.2642707824707, "loss": 0.5583, "losses/dpo": 0.6550332307815552, "losses/sft": 0.844421923160553, "losses/total": 0.6550332307815552, "ref_logps/chosen": -22.528621673583984, "ref_logps/rejected": -25.07666778564453, "rewards/accuracies": 0.765625, "rewards/chosen": -0.4007226824760437, "rewards/margins": 0.4180375933647156, "rewards/rejected": -0.8187602758407593, "step": 275 }, { "epoch": 2.08, "learning_rate": 1.6853932584269663e-07, "logps/chosen": -26.93305778503418, "logps/rejected": -36.43919372558594, "loss": 0.5267, "losses/dpo": 0.37509262561798096, "losses/sft": 0.9286944270133972, "losses/total": 0.37509262561798096, "ref_logps/chosen": -22.965662002563477, "ref_logps/rejected": -27.516300201416016, "rewards/accuracies": 0.765625, "rewards/chosen": -0.39673954248428345, "rewards/margins": 0.4955495595932007, "rewards/rejected": -0.8922891616821289, "step": 276 }, { "epoch": 2.09, "learning_rate": 1.6713483146067413e-07, "logps/chosen": -27.517230987548828, "logps/rejected": -33.23160934448242, "loss": 0.585, "losses/dpo": 0.45891374349594116, "losses/sft": 0.8818660378456116, "losses/total": 0.45891374349594116, "ref_logps/chosen": -23.37508773803711, "ref_logps/rejected": -25.319503784179688, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4142143726348877, "rewards/margins": 0.37699633836746216, "rewards/rejected": -0.7912107110023499, "step": 277 }, { "epoch": 2.1, "learning_rate": 1.6573033707865166e-07, "logps/chosen": -29.848949432373047, "logps/rejected": -37.009605407714844, "loss": 0.5569, "losses/dpo": 0.6695871353149414, "losses/sft": 1.1478632688522339, "losses/total": 0.6695871353149414, "ref_logps/chosen": -25.79513168334961, "ref_logps/rejected": -28.52492332458496, "rewards/accuracies": 0.7421875, "rewards/chosen": -0.4053817391395569, "rewards/margins": 0.4430864751338959, "rewards/rejected": -0.8484681844711304, "step": 278 }, { "epoch": 2.11, "learning_rate": 1.6432584269662922e-07, "logps/chosen": -26.847461700439453, "logps/rejected": -33.84664535522461, "loss": 0.5853, "losses/dpo": 0.6266674995422363, "losses/sft": 0.9419240951538086, "losses/total": 0.6266674995422363, "ref_logps/chosen": -23.1273193359375, "ref_logps/rejected": -26.497238159179688, "rewards/accuracies": 0.671875, "rewards/chosen": -0.3720143437385559, "rewards/margins": 0.3629264533519745, "rewards/rejected": -0.734940767288208, "step": 279 }, { "epoch": 2.11, "learning_rate": 1.6292134831460675e-07, "logps/chosen": -25.227951049804688, "logps/rejected": -37.79768371582031, "loss": 0.5277, "losses/dpo": 0.5965819358825684, "losses/sft": 1.0364360809326172, "losses/total": 0.5965819358825684, "ref_logps/chosen": -20.85896873474121, "ref_logps/rejected": -28.49602508544922, "rewards/accuracies": 0.7890625, "rewards/chosen": -0.43689805269241333, "rewards/margins": 0.49326756596565247, "rewards/rejected": -0.9301656484603882, "step": 280 }, { "epoch": 2.12, "learning_rate": 1.6151685393258428e-07, "logps/chosen": -25.945070266723633, "logps/rejected": -35.28973388671875, "loss": 0.5305, "losses/dpo": 0.5456879138946533, "losses/sft": 0.8692267537117004, "losses/total": 0.5456879138946533, "ref_logps/chosen": -22.303909301757812, "ref_logps/rejected": -26.77994155883789, "rewards/accuracies": 0.75, "rewards/chosen": -0.36411628127098083, "rewards/margins": 0.4868628680706024, "rewards/rejected": -0.850979208946228, "step": 281 }, { "epoch": 2.13, "learning_rate": 1.6011235955056178e-07, "logps/chosen": -28.660266876220703, "logps/rejected": -36.41142272949219, "loss": 0.5766, "losses/dpo": 0.6054384708404541, "losses/sft": 0.9599564671516418, "losses/total": 0.6054384708404541, "ref_logps/chosen": -24.156015396118164, "ref_logps/rejected": -28.24047088623047, "rewards/accuracies": 0.734375, "rewards/chosen": -0.4504254460334778, "rewards/margins": 0.36666956543922424, "rewards/rejected": -0.8170950412750244, "step": 282 }, { "epoch": 2.14, "learning_rate": 1.5870786516853931e-07, "logps/chosen": -27.74228858947754, "logps/rejected": -38.50691604614258, "loss": 0.5215, "losses/dpo": 0.463223397731781, "losses/sft": 1.041387915611267, "losses/total": 0.463223397731781, "ref_logps/chosen": -23.603931427001953, "ref_logps/rejected": -28.803909301757812, "rewards/accuracies": 0.75, "rewards/chosen": -0.4138358533382416, "rewards/margins": 0.5564644932746887, "rewards/rejected": -0.9703004360198975, "step": 283 }, { "epoch": 2.14, "learning_rate": 1.5730337078651685e-07, "logps/chosen": -24.93131446838379, "logps/rejected": -35.91729736328125, "loss": 0.5266, "losses/dpo": 0.6279169321060181, "losses/sft": 0.8709256052970886, "losses/total": 0.6279169321060181, "ref_logps/chosen": -20.768774032592773, "ref_logps/rejected": -26.83118438720703, "rewards/accuracies": 0.7734375, "rewards/chosen": -0.4162542223930359, "rewards/margins": 0.4923573136329651, "rewards/rejected": -0.9086115956306458, "step": 284 }, { "epoch": 2.15, "learning_rate": 1.5589887640449438e-07, "logps/chosen": -27.571338653564453, "logps/rejected": -38.57915496826172, "loss": 0.5687, "losses/dpo": 0.5966840386390686, "losses/sft": 0.9412966966629028, "losses/total": 0.5966840386390686, "ref_logps/chosen": -22.682205200195312, "ref_logps/rejected": -29.58980369567871, "rewards/accuracies": 0.7265625, "rewards/chosen": -0.48891347646713257, "rewards/margins": 0.4100216031074524, "rewards/rejected": -0.898935079574585, "step": 285 }, { "epoch": 2.16, "learning_rate": 1.5449438202247188e-07, "logps/chosen": -25.46674346923828, "logps/rejected": -33.395118713378906, "loss": 0.5902, "losses/dpo": 0.7199227213859558, "losses/sft": 0.9989073276519775, "losses/total": 0.7199227213859558, "ref_logps/chosen": -21.129976272583008, "ref_logps/rejected": -25.61261749267578, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4336766302585602, "rewards/margins": 0.3445735573768616, "rewards/rejected": -0.7782501578330994, "step": 286 }, { "epoch": 2.17, "learning_rate": 1.5308988764044944e-07, "logps/chosen": -26.559568405151367, "logps/rejected": -39.50514221191406, "loss": 0.5101, "losses/dpo": 0.42156773805618286, "losses/sft": 0.824786365032196, "losses/total": 0.42156773805618286, "ref_logps/chosen": -21.88895606994629, "ref_logps/rejected": -29.256093978881836, "rewards/accuracies": 0.8046875, "rewards/chosen": -0.46706122159957886, "rewards/margins": 0.557843804359436, "rewards/rejected": -1.0249050855636597, "step": 287 }, { "epoch": 2.17, "learning_rate": 1.5168539325842697e-07, "logps/chosen": -27.430574417114258, "logps/rejected": -35.846214294433594, "loss": 0.5852, "losses/dpo": 0.7073966264724731, "losses/sft": 0.959773600101471, "losses/total": 0.7073966264724731, "ref_logps/chosen": -22.33085060119629, "ref_logps/rejected": -26.85816192626953, "rewards/accuracies": 0.7109375, "rewards/chosen": -0.509972333908081, "rewards/margins": 0.38883259892463684, "rewards/rejected": -0.8988049030303955, "step": 288 }, { "epoch": 2.18, "learning_rate": 1.502808988764045e-07, "logps/chosen": -25.799861907958984, "logps/rejected": -37.50861358642578, "loss": 0.5553, "losses/dpo": 0.5419721603393555, "losses/sft": 0.940202534198761, "losses/total": 0.5419721603393555, "ref_logps/chosen": -21.224933624267578, "ref_logps/rejected": -28.002174377441406, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4574929475784302, "rewards/margins": 0.4931509494781494, "rewards/rejected": -0.9506438970565796, "step": 289 }, { "epoch": 2.19, "learning_rate": 1.4887640449438203e-07, "logps/chosen": -27.79110336303711, "logps/rejected": -34.21430206298828, "loss": 0.5921, "losses/dpo": 0.6595107913017273, "losses/sft": 1.0057413578033447, "losses/total": 0.6595107913017273, "ref_logps/chosen": -23.073078155517578, "ref_logps/rejected": -26.093578338623047, "rewards/accuracies": 0.71875, "rewards/chosen": -0.4718025326728821, "rewards/margins": 0.34026968479156494, "rewards/rejected": -0.8120721578598022, "step": 290 }, { "epoch": 2.2, "learning_rate": 1.4747191011235953e-07, "logps/chosen": -26.380355834960938, "logps/rejected": -37.56932830810547, "loss": 0.5263, "losses/dpo": 0.47728973627090454, "losses/sft": 1.0133030414581299, "losses/total": 0.47728973627090454, "ref_logps/chosen": -22.488906860351562, "ref_logps/rejected": -28.34372329711914, "rewards/accuracies": 0.71875, "rewards/chosen": -0.38914480805397034, "rewards/margins": 0.5334160327911377, "rewards/rejected": -0.9225608110427856, "step": 291 }, { "epoch": 2.2, "learning_rate": 1.4606741573033706e-07, "logps/chosen": -27.006134033203125, "logps/rejected": -37.092594146728516, "loss": 0.5417, "losses/dpo": 0.7257384061813354, "losses/sft": 1.2120591402053833, "losses/total": 0.7257384061813354, "ref_logps/chosen": -22.130794525146484, "ref_logps/rejected": -27.460662841796875, "rewards/accuracies": 0.75, "rewards/chosen": -0.48753368854522705, "rewards/margins": 0.47565943002700806, "rewards/rejected": -0.9631930589675903, "step": 292 }, { "epoch": 2.21, "learning_rate": 1.446629213483146e-07, "logps/chosen": -25.57880210876465, "logps/rejected": -37.16014099121094, "loss": 0.5381, "losses/dpo": 0.6313049793243408, "losses/sft": 0.9201721549034119, "losses/total": 0.6313049793243408, "ref_logps/chosen": -21.904296875, "ref_logps/rejected": -28.32394027709961, "rewards/accuracies": 0.6953125, "rewards/chosen": -0.3674505054950714, "rewards/margins": 0.5161697268486023, "rewards/rejected": -0.8836201429367065, "step": 293 }, { "epoch": 2.22, "learning_rate": 1.4325842696629212e-07, "logps/chosen": -24.601829528808594, "logps/rejected": -37.283538818359375, "loss": 0.5429, "losses/dpo": 0.5216307044029236, "losses/sft": 1.0138075351715088, "losses/total": 0.5216307044029236, "ref_logps/chosen": -20.36395263671875, "ref_logps/rejected": -28.042282104492188, "rewards/accuracies": 0.7734375, "rewards/chosen": -0.4237874746322632, "rewards/margins": 0.5003381967544556, "rewards/rejected": -0.9241256713867188, "step": 294 }, { "epoch": 2.23, "learning_rate": 1.4185393258426968e-07, "logps/chosen": -28.58258819580078, "logps/rejected": -36.47199249267578, "loss": 0.5894, "losses/dpo": 0.43838924169540405, "losses/sft": 1.2099077701568604, "losses/total": 0.43838924169540405, "ref_logps/chosen": -23.226455688476562, "ref_logps/rejected": -26.880578994750977, "rewards/accuracies": 0.6953125, "rewards/chosen": -0.5356131792068481, "rewards/margins": 0.423528254032135, "rewards/rejected": -0.9591414332389832, "step": 295 }, { "epoch": 2.23, "learning_rate": 1.4044943820224718e-07, "logps/chosen": -28.39300537109375, "logps/rejected": -36.5651741027832, "loss": 0.5472, "losses/dpo": 0.6286274790763855, "losses/sft": 1.1655751466751099, "losses/total": 0.6286274790763855, "ref_logps/chosen": -23.700767517089844, "ref_logps/rejected": -26.989849090576172, "rewards/accuracies": 0.7265625, "rewards/chosen": -0.4692240357398987, "rewards/margins": 0.48830845952033997, "rewards/rejected": -0.9575324058532715, "step": 296 }, { "epoch": 2.24, "learning_rate": 1.3904494382022472e-07, "logps/chosen": -27.01761245727539, "logps/rejected": -34.147830963134766, "loss": 0.5581, "losses/dpo": 0.425361692905426, "losses/sft": 1.129596471786499, "losses/total": 0.425361692905426, "ref_logps/chosen": -22.251991271972656, "ref_logps/rejected": -24.953710556030273, "rewards/accuracies": 0.671875, "rewards/chosen": -0.47656214237213135, "rewards/margins": 0.4428498148918152, "rewards/rejected": -0.9194119572639465, "step": 297 }, { "epoch": 2.25, "learning_rate": 1.3764044943820225e-07, "logps/chosen": -27.92938995361328, "logps/rejected": -39.81676483154297, "loss": 0.5111, "losses/dpo": 0.5578055381774902, "losses/sft": 1.1197444200515747, "losses/total": 0.5578055381774902, "ref_logps/chosen": -23.314014434814453, "ref_logps/rejected": -29.507539749145508, "rewards/accuracies": 0.78125, "rewards/chosen": -0.4615376591682434, "rewards/margins": 0.5693849325180054, "rewards/rejected": -1.0309226512908936, "step": 298 }, { "epoch": 2.26, "learning_rate": 1.3623595505617978e-07, "logps/chosen": -27.910144805908203, "logps/rejected": -35.69133758544922, "loss": 0.5499, "losses/dpo": 0.4847102165222168, "losses/sft": 0.989621639251709, "losses/total": 0.4847102165222168, "ref_logps/chosen": -23.326908111572266, "ref_logps/rejected": -26.762676239013672, "rewards/accuracies": 0.734375, "rewards/chosen": -0.45832377672195435, "rewards/margins": 0.43454277515411377, "rewards/rejected": -0.8928664922714233, "step": 299 }, { "epoch": 2.26, "learning_rate": 1.3483146067415728e-07, "logps/chosen": -28.233020782470703, "logps/rejected": -37.3542366027832, "loss": 0.5935, "losses/dpo": 0.5905570983886719, "losses/sft": 1.0464057922363281, "losses/total": 0.5905570983886719, "ref_logps/chosen": -23.20905303955078, "ref_logps/rejected": -28.49638557434082, "rewards/accuracies": 0.71875, "rewards/chosen": -0.5023964643478394, "rewards/margins": 0.38338857889175415, "rewards/rejected": -0.8857850432395935, "step": 300 }, { "epoch": 2.27, "learning_rate": 1.334269662921348e-07, "logps/chosen": -29.44438934326172, "logps/rejected": -36.25569152832031, "loss": 0.5608, "losses/dpo": 0.5518324375152588, "losses/sft": 0.9761526584625244, "losses/total": 0.5518324375152588, "ref_logps/chosen": -24.553022384643555, "ref_logps/rejected": -26.846464157104492, "rewards/accuracies": 0.7109375, "rewards/chosen": -0.4891367554664612, "rewards/margins": 0.451786071062088, "rewards/rejected": -0.9409228563308716, "step": 301 }, { "epoch": 2.28, "learning_rate": 1.3202247191011234e-07, "logps/chosen": -29.32979393005371, "logps/rejected": -37.83529281616211, "loss": 0.5463, "losses/dpo": 0.5125599503517151, "losses/sft": 0.9747940897941589, "losses/total": 0.5125599503517151, "ref_logps/chosen": -24.211776733398438, "ref_logps/rejected": -27.92011260986328, "rewards/accuracies": 0.734375, "rewards/chosen": -0.5118017196655273, "rewards/margins": 0.47971609234809875, "rewards/rejected": -0.9915178418159485, "step": 302 }, { "epoch": 2.29, "learning_rate": 1.306179775280899e-07, "logps/chosen": -27.11697769165039, "logps/rejected": -35.418338775634766, "loss": 0.575, "losses/dpo": 0.5703020095825195, "losses/sft": 0.9395530223846436, "losses/total": 0.5703020095825195, "ref_logps/chosen": -22.12503433227539, "ref_logps/rejected": -26.28829574584961, "rewards/accuracies": 0.7265625, "rewards/chosen": -0.49919426441192627, "rewards/margins": 0.4138101041316986, "rewards/rejected": -0.9130042791366577, "step": 303 }, { "epoch": 2.29, "learning_rate": 1.2921348314606743e-07, "logps/chosen": -31.24747085571289, "logps/rejected": -38.8961181640625, "loss": 0.5843, "losses/dpo": 0.4914831221103668, "losses/sft": 0.9517439603805542, "losses/total": 0.4914831221103668, "ref_logps/chosen": -25.351207733154297, "ref_logps/rejected": -29.048057556152344, "rewards/accuracies": 0.734375, "rewards/chosen": -0.5896263122558594, "rewards/margins": 0.3951793909072876, "rewards/rejected": -0.9848057627677917, "step": 304 }, { "epoch": 2.3, "learning_rate": 1.2780898876404493e-07, "logps/chosen": -27.757152557373047, "logps/rejected": -37.481964111328125, "loss": 0.5261, "losses/dpo": 0.4620394706726074, "losses/sft": 1.0134756565093994, "losses/total": 0.4620394706726074, "ref_logps/chosen": -23.13431167602539, "ref_logps/rejected": -27.492977142333984, "rewards/accuracies": 0.75, "rewards/chosen": -0.4622839689254761, "rewards/margins": 0.5366144776344299, "rewards/rejected": -0.9988985061645508, "step": 305 }, { "epoch": 2.31, "learning_rate": 1.2640449438202246e-07, "logps/chosen": -26.990705490112305, "logps/rejected": -34.95043182373047, "loss": 0.5636, "losses/dpo": 0.5714951753616333, "losses/sft": 0.9859296679496765, "losses/total": 0.5714951753616333, "ref_logps/chosen": -22.465744018554688, "ref_logps/rejected": -26.120864868164062, "rewards/accuracies": 0.7421875, "rewards/chosen": -0.452495813369751, "rewards/margins": 0.4304611086845398, "rewards/rejected": -0.882956862449646, "step": 306 }, { "epoch": 2.32, "learning_rate": 1.25e-07, "logps/chosen": -29.839576721191406, "logps/rejected": -40.363712310791016, "loss": 0.5508, "losses/dpo": 0.5849748253822327, "losses/sft": 0.9925932288169861, "losses/total": 0.5849748253822327, "ref_logps/chosen": -24.343534469604492, "ref_logps/rejected": -29.49897003173828, "rewards/accuracies": 0.75, "rewards/chosen": -0.5496042370796204, "rewards/margins": 0.5368699431419373, "rewards/rejected": -1.0864741802215576, "step": 307 }, { "epoch": 2.32, "learning_rate": 1.2359550561797752e-07, "logps/chosen": -26.925609588623047, "logps/rejected": -35.485931396484375, "loss": 0.5537, "losses/dpo": 0.43898260593414307, "losses/sft": 0.8520787954330444, "losses/total": 0.43898260593414307, "ref_logps/chosen": -22.369102478027344, "ref_logps/rejected": -26.426111221313477, "rewards/accuracies": 0.7734375, "rewards/chosen": -0.4556504487991333, "rewards/margins": 0.450331449508667, "rewards/rejected": -0.9059818387031555, "step": 308 }, { "epoch": 2.33, "learning_rate": 1.2219101123595506e-07, "logps/chosen": -27.308425903320312, "logps/rejected": -34.86455535888672, "loss": 0.6099, "losses/dpo": 0.6877168416976929, "losses/sft": 0.8925371766090393, "losses/total": 0.6877168416976929, "ref_logps/chosen": -22.015674591064453, "ref_logps/rejected": -26.024978637695312, "rewards/accuracies": 0.6796875, "rewards/chosen": -0.5292750597000122, "rewards/margins": 0.3546826243400574, "rewards/rejected": -0.8839576840400696, "step": 309 }, { "epoch": 2.34, "learning_rate": 1.2078651685393259e-07, "logps/chosen": -27.23873519897461, "logps/rejected": -34.355613708496094, "loss": 0.5451, "losses/dpo": 0.4608323574066162, "losses/sft": 1.068372130393982, "losses/total": 0.4608323574066162, "ref_logps/chosen": -22.93021011352539, "ref_logps/rejected": -25.29645538330078, "rewards/accuracies": 0.7421875, "rewards/chosen": -0.4308522939682007, "rewards/margins": 0.4750638008117676, "rewards/rejected": -0.9059160947799683, "step": 310 }, { "epoch": 2.35, "learning_rate": 1.1938202247191012e-07, "logps/chosen": -27.410297393798828, "logps/rejected": -36.60034942626953, "loss": 0.5435, "losses/dpo": 0.49991002678871155, "losses/sft": 0.9416501522064209, "losses/total": 0.49991002678871155, "ref_logps/chosen": -22.95693016052246, "ref_logps/rejected": -27.357826232910156, "rewards/accuracies": 0.765625, "rewards/chosen": -0.4453369379043579, "rewards/margins": 0.47891533374786377, "rewards/rejected": -0.9242523312568665, "step": 311 }, { "epoch": 2.35, "learning_rate": 1.1797752808988763e-07, "logps/chosen": -26.075031280517578, "logps/rejected": -35.989906311035156, "loss": 0.5108, "losses/dpo": 0.49980589747428894, "losses/sft": 0.8830540776252747, "losses/total": 0.49980589747428894, "ref_logps/chosen": -21.97342300415039, "ref_logps/rejected": -26.20843505859375, "rewards/accuracies": 0.78125, "rewards/chosen": -0.4101608395576477, "rewards/margins": 0.5679866075515747, "rewards/rejected": -0.9781473875045776, "step": 312 }, { "epoch": 2.36, "learning_rate": 1.1657303370786515e-07, "logps/chosen": -27.034866333007812, "logps/rejected": -36.756141662597656, "loss": 0.535, "losses/dpo": 0.5506036281585693, "losses/sft": 0.842628002166748, "losses/total": 0.5506036281585693, "ref_logps/chosen": -22.406818389892578, "ref_logps/rejected": -26.883403778076172, "rewards/accuracies": 0.7734375, "rewards/chosen": -0.4628047049045563, "rewards/margins": 0.5244689583778381, "rewards/rejected": -0.9872736930847168, "step": 313 }, { "epoch": 2.37, "learning_rate": 1.151685393258427e-07, "logps/chosen": -24.555809020996094, "logps/rejected": -32.58570861816406, "loss": 0.6002, "losses/dpo": 0.643724262714386, "losses/sft": 0.86636883020401, "losses/total": 0.643724262714386, "ref_logps/chosen": -20.17746353149414, "ref_logps/rejected": -24.76491928100586, "rewards/accuracies": 0.6640625, "rewards/chosen": -0.4378345012664795, "rewards/margins": 0.3442443907260895, "rewards/rejected": -0.7820788621902466, "step": 314 }, { "epoch": 2.38, "learning_rate": 1.1376404494382023e-07, "logps/chosen": -23.88672637939453, "logps/rejected": -34.9397087097168, "loss": 0.529, "losses/dpo": 0.5676740407943726, "losses/sft": 0.8977797627449036, "losses/total": 0.5676740407943726, "ref_logps/chosen": -19.26026153564453, "ref_logps/rejected": -25.219451904296875, "rewards/accuracies": 0.7578125, "rewards/chosen": -0.4626464247703552, "rewards/margins": 0.5093792676925659, "rewards/rejected": -0.9720257520675659, "step": 315 }, { "epoch": 2.38, "learning_rate": 1.1235955056179774e-07, "logps/chosen": -28.308603286743164, "logps/rejected": -38.62759017944336, "loss": 0.5376, "losses/dpo": 0.4975647032260895, "losses/sft": 1.098832130432129, "losses/total": 0.4975647032260895, "ref_logps/chosen": -22.983192443847656, "ref_logps/rejected": -28.174081802368164, "rewards/accuracies": 0.75, "rewards/chosen": -0.5325409173965454, "rewards/margins": 0.5128099918365479, "rewards/rejected": -1.0453509092330933, "step": 316 }, { "epoch": 2.39, "learning_rate": 1.1095505617977527e-07, "logps/chosen": -29.64281463623047, "logps/rejected": -34.92308807373047, "loss": 0.6085, "losses/dpo": 0.6205140352249146, "losses/sft": 1.0714130401611328, "losses/total": 0.6205140352249146, "ref_logps/chosen": -23.70892333984375, "ref_logps/rejected": -25.790882110595703, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5933888554573059, "rewards/margins": 0.31983205676078796, "rewards/rejected": -0.9132209420204163, "step": 317 }, { "epoch": 2.4, "learning_rate": 1.095505617977528e-07, "logps/chosen": -27.25480079650879, "logps/rejected": -35.160831451416016, "loss": 0.5594, "losses/dpo": 0.46756136417388916, "losses/sft": 1.0184146165847778, "losses/total": 0.46756136417388916, "ref_logps/chosen": -22.160459518432617, "ref_logps/rejected": -25.47940444946289, "rewards/accuracies": 0.7109375, "rewards/chosen": -0.5094340443611145, "rewards/margins": 0.45870864391326904, "rewards/rejected": -0.9681426882743835, "step": 318 }, { "epoch": 2.41, "learning_rate": 1.0814606741573033e-07, "logps/chosen": -28.060367584228516, "logps/rejected": -35.89076232910156, "loss": 0.5957, "losses/dpo": 0.6158527135848999, "losses/sft": 0.9492118954658508, "losses/total": 0.6158527135848999, "ref_logps/chosen": -22.560768127441406, "ref_logps/rejected": -26.671247482299805, "rewards/accuracies": 0.7265625, "rewards/chosen": -0.5499600172042847, "rewards/margins": 0.3719918131828308, "rewards/rejected": -0.9219518899917603, "step": 319 }, { "epoch": 2.42, "learning_rate": 1.0674157303370785e-07, "logps/chosen": -28.077362060546875, "logps/rejected": -32.179466247558594, "loss": 0.579, "losses/dpo": 0.5990191698074341, "losses/sft": 1.0173970460891724, "losses/total": 0.5990191698074341, "ref_logps/chosen": -23.08903694152832, "ref_logps/rejected": -23.298954010009766, "rewards/accuracies": 0.6796875, "rewards/chosen": -0.4988323450088501, "rewards/margins": 0.38921868801116943, "rewards/rejected": -0.8880510330200195, "step": 320 }, { "epoch": 2.42, "learning_rate": 1.0533707865168538e-07, "logps/chosen": -28.690523147583008, "logps/rejected": -34.72178268432617, "loss": 0.5694, "losses/dpo": 0.6682005524635315, "losses/sft": 0.9579723477363586, "losses/total": 0.6682005524635315, "ref_logps/chosen": -23.495473861694336, "ref_logps/rejected": -25.300090789794922, "rewards/accuracies": 0.7265625, "rewards/chosen": -0.519504964351654, "rewards/margins": 0.42266416549682617, "rewards/rejected": -0.942169189453125, "step": 321 }, { "epoch": 2.43, "learning_rate": 1.0393258426966293e-07, "logps/chosen": -25.57655143737793, "logps/rejected": -33.412723541259766, "loss": 0.5471, "losses/dpo": 0.6572707891464233, "losses/sft": 1.028795599937439, "losses/total": 0.6572707891464233, "ref_logps/chosen": -21.189647674560547, "ref_logps/rejected": -24.523921966552734, "rewards/accuracies": 0.75, "rewards/chosen": -0.4386903643608093, "rewards/margins": 0.45018988847732544, "rewards/rejected": -0.8888803124427795, "step": 322 }, { "epoch": 2.44, "learning_rate": 1.0252808988764044e-07, "logps/chosen": -29.36073112487793, "logps/rejected": -33.21003723144531, "loss": 0.5813, "losses/dpo": 0.5923129916191101, "losses/sft": 0.9457908272743225, "losses/total": 0.5923129916191101, "ref_logps/chosen": -24.347349166870117, "ref_logps/rejected": -24.210060119628906, "rewards/accuracies": 0.7265625, "rewards/chosen": -0.5013381838798523, "rewards/margins": 0.3986593186855316, "rewards/rejected": -0.8999974727630615, "step": 323 }, { "epoch": 2.45, "learning_rate": 1.0112359550561797e-07, "logps/chosen": -25.595857620239258, "logps/rejected": -36.034000396728516, "loss": 0.5458, "losses/dpo": 0.5657480359077454, "losses/sft": 0.9646883606910706, "losses/total": 0.5657480359077454, "ref_logps/chosen": -21.133930206298828, "ref_logps/rejected": -26.436534881591797, "rewards/accuracies": 0.7265625, "rewards/chosen": -0.44619300961494446, "rewards/margins": 0.5135533809661865, "rewards/rejected": -0.9597463607788086, "step": 324 }, { "epoch": 2.45, "learning_rate": 9.971910112359549e-08, "logps/chosen": -25.68283462524414, "logps/rejected": -35.9984130859375, "loss": 0.5137, "losses/dpo": 0.5698142051696777, "losses/sft": 0.9736462235450745, "losses/total": 0.5698142051696777, "ref_logps/chosen": -21.976032257080078, "ref_logps/rejected": -26.659069061279297, "rewards/accuracies": 0.796875, "rewards/chosen": -0.370680034160614, "rewards/margins": 0.5632542371749878, "rewards/rejected": -0.9339342713356018, "step": 325 }, { "epoch": 2.46, "learning_rate": 9.831460674157303e-08, "logps/chosen": -27.503202438354492, "logps/rejected": -37.614471435546875, "loss": 0.5574, "losses/dpo": 0.6440725326538086, "losses/sft": 0.963034987449646, "losses/total": 0.6440725326538086, "ref_logps/chosen": -22.742584228515625, "ref_logps/rejected": -28.33091163635254, "rewards/accuracies": 0.703125, "rewards/chosen": -0.47606179118156433, "rewards/margins": 0.45229417085647583, "rewards/rejected": -0.9283559918403625, "step": 326 }, { "epoch": 2.47, "learning_rate": 9.691011235955055e-08, "logps/chosen": -27.817134857177734, "logps/rejected": -34.62038803100586, "loss": 0.5561, "losses/dpo": 0.6037241816520691, "losses/sft": 0.9915317296981812, "losses/total": 0.6037241816520691, "ref_logps/chosen": -23.01769256591797, "ref_logps/rejected": -25.366844177246094, "rewards/accuracies": 0.7421875, "rewards/chosen": -0.47994428873062134, "rewards/margins": 0.44541001319885254, "rewards/rejected": -0.9253543019294739, "step": 327 }, { "epoch": 2.48, "learning_rate": 9.550561797752808e-08, "logps/chosen": -29.053791046142578, "logps/rejected": -36.763641357421875, "loss": 0.5811, "losses/dpo": 0.5880983471870422, "losses/sft": 1.1540213823318481, "losses/total": 0.5880983471870422, "ref_logps/chosen": -23.688079833984375, "ref_logps/rejected": -27.189102172851562, "rewards/accuracies": 0.6796875, "rewards/chosen": -0.5365712642669678, "rewards/margins": 0.42088285088539124, "rewards/rejected": -0.9574541449546814, "step": 328 }, { "epoch": 2.48, "learning_rate": 9.410112359550561e-08, "logps/chosen": -25.49301528930664, "logps/rejected": -34.453372955322266, "loss": 0.56, "losses/dpo": 0.4375653862953186, "losses/sft": 1.0353739261627197, "losses/total": 0.4375653862953186, "ref_logps/chosen": -21.16693115234375, "ref_logps/rejected": -25.54058837890625, "rewards/accuracies": 0.7265625, "rewards/chosen": -0.43260854482650757, "rewards/margins": 0.45866984128952026, "rewards/rejected": -0.8912783861160278, "step": 329 }, { "epoch": 2.49, "learning_rate": 9.269662921348314e-08, "logps/chosen": -27.841381072998047, "logps/rejected": -31.897401809692383, "loss": 0.5787, "losses/dpo": 0.6422601938247681, "losses/sft": 0.9122541546821594, "losses/total": 0.6422601938247681, "ref_logps/chosen": -23.25330352783203, "ref_logps/rejected": -23.617647171020508, "rewards/accuracies": 0.7109375, "rewards/chosen": -0.45880773663520813, "rewards/margins": 0.3691678047180176, "rewards/rejected": -0.8279755115509033, "step": 330 }, { "epoch": 2.5, "learning_rate": 9.129213483146067e-08, "logps/chosen": -28.4913330078125, "logps/rejected": -37.258323669433594, "loss": 0.5962, "losses/dpo": 0.7063708901405334, "losses/sft": 1.0378127098083496, "losses/total": 0.7063708901405334, "ref_logps/chosen": -22.901752471923828, "ref_logps/rejected": -27.694263458251953, "rewards/accuracies": 0.71875, "rewards/chosen": -0.558958113193512, "rewards/margins": 0.3974474370479584, "rewards/rejected": -0.956405520439148, "step": 331 }, { "epoch": 2.51, "learning_rate": 8.988764044943819e-08, "logps/chosen": -28.342041015625, "logps/rejected": -36.76737976074219, "loss": 0.5382, "losses/dpo": 0.6065940856933594, "losses/sft": 1.061606526374817, "losses/total": 0.6065940856933594, "ref_logps/chosen": -22.77169418334961, "ref_logps/rejected": -26.355728149414062, "rewards/accuracies": 0.7578125, "rewards/chosen": -0.5570348501205444, "rewards/margins": 0.48413002490997314, "rewards/rejected": -1.0411648750305176, "step": 332 }, { "epoch": 2.51, "learning_rate": 8.848314606741572e-08, "logps/chosen": -28.1925048828125, "logps/rejected": -36.757789611816406, "loss": 0.542, "losses/dpo": 0.5146865844726562, "losses/sft": 0.82643723487854, "losses/total": 0.5146865844726562, "ref_logps/chosen": -22.9779052734375, "ref_logps/rejected": -26.85318374633789, "rewards/accuracies": 0.75, "rewards/chosen": -0.5214601755142212, "rewards/margins": 0.4690002501010895, "rewards/rejected": -0.9904604554176331, "step": 333 }, { "epoch": 2.52, "learning_rate": 8.707865168539325e-08, "logps/chosen": -26.62852668762207, "logps/rejected": -34.3861083984375, "loss": 0.5288, "losses/dpo": 0.554874062538147, "losses/sft": 0.9589724540710449, "losses/total": 0.554874062538147, "ref_logps/chosen": -22.37548065185547, "ref_logps/rejected": -25.256423950195312, "rewards/accuracies": 0.7578125, "rewards/chosen": -0.4253048598766327, "rewards/margins": 0.4876634180545807, "rewards/rejected": -0.9129682183265686, "step": 334 }, { "epoch": 2.53, "learning_rate": 8.567415730337078e-08, "logps/chosen": -30.657304763793945, "logps/rejected": -35.13941192626953, "loss": 0.564, "losses/dpo": 0.5628042817115784, "losses/sft": 0.9816582202911377, "losses/total": 0.5628042817115784, "ref_logps/chosen": -25.568553924560547, "ref_logps/rejected": -26.13833236694336, "rewards/accuracies": 0.765625, "rewards/chosen": -0.5088753700256348, "rewards/margins": 0.3912326395511627, "rewards/rejected": -0.9001079797744751, "step": 335 }, { "epoch": 2.54, "learning_rate": 8.426966292134831e-08, "logps/chosen": -27.922801971435547, "logps/rejected": -35.52781677246094, "loss": 0.5751, "losses/dpo": 0.5604207515716553, "losses/sft": 0.9308174848556519, "losses/total": 0.5604207515716553, "ref_logps/chosen": -22.82427978515625, "ref_logps/rejected": -26.594511032104492, "rewards/accuracies": 0.7578125, "rewards/chosen": -0.5098517537117004, "rewards/margins": 0.38347893953323364, "rewards/rejected": -0.8933306932449341, "step": 336 }, { "epoch": 2.54, "learning_rate": 8.286516853932583e-08, "logps/chosen": -26.58924674987793, "logps/rejected": -37.19468688964844, "loss": 0.5092, "losses/dpo": 0.5189211368560791, "losses/sft": 0.9888613224029541, "losses/total": 0.5189211368560791, "ref_logps/chosen": -22.07261085510254, "ref_logps/rejected": -26.852371215820312, "rewards/accuracies": 0.75, "rewards/chosen": -0.45166367292404175, "rewards/margins": 0.5825679302215576, "rewards/rejected": -1.0342316627502441, "step": 337 }, { "epoch": 2.55, "learning_rate": 8.146067415730337e-08, "logps/chosen": -26.25957489013672, "logps/rejected": -34.435699462890625, "loss": 0.5636, "losses/dpo": 0.5064201951026917, "losses/sft": 1.0053503513336182, "losses/total": 0.5064201951026917, "ref_logps/chosen": -21.541664123535156, "ref_logps/rejected": -25.595813751220703, "rewards/accuracies": 0.7265625, "rewards/chosen": -0.47179120779037476, "rewards/margins": 0.41219767928123474, "rewards/rejected": -0.8839888572692871, "step": 338 }, { "epoch": 2.56, "learning_rate": 8.005617977528089e-08, "logps/chosen": -28.839031219482422, "logps/rejected": -37.7884635925293, "loss": 0.5099, "losses/dpo": 0.5300096273422241, "losses/sft": 0.9853606224060059, "losses/total": 0.5300096273422241, "ref_logps/chosen": -24.249427795410156, "ref_logps/rejected": -27.3421688079834, "rewards/accuracies": 0.78125, "rewards/chosen": -0.4589604139328003, "rewards/margins": 0.5856689810752869, "rewards/rejected": -1.0446293354034424, "step": 339 }, { "epoch": 2.57, "learning_rate": 7.865168539325842e-08, "logps/chosen": -24.501375198364258, "logps/rejected": -35.306766510009766, "loss": 0.5489, "losses/dpo": 0.5961363315582275, "losses/sft": 1.0056707859039307, "losses/total": 0.5961363315582275, "ref_logps/chosen": -19.69840431213379, "ref_logps/rejected": -26.113300323486328, "rewards/accuracies": 0.75, "rewards/chosen": -0.4802970886230469, "rewards/margins": 0.43904954195022583, "rewards/rejected": -0.9193466305732727, "step": 340 }, { "epoch": 2.57, "learning_rate": 7.724719101123594e-08, "logps/chosen": -28.589330673217773, "logps/rejected": -37.83360290527344, "loss": 0.5047, "losses/dpo": 0.584464430809021, "losses/sft": 1.1327065229415894, "losses/total": 0.584464430809021, "ref_logps/chosen": -23.609786987304688, "ref_logps/rejected": -27.048458099365234, "rewards/accuracies": 0.7734375, "rewards/chosen": -0.49795451760292053, "rewards/margins": 0.5805596113204956, "rewards/rejected": -1.0785142183303833, "step": 341 }, { "epoch": 2.58, "learning_rate": 7.584269662921348e-08, "logps/chosen": -26.895156860351562, "logps/rejected": -36.51441192626953, "loss": 0.5664, "losses/dpo": 0.5289937257766724, "losses/sft": 0.9366389513015747, "losses/total": 0.5289937257766724, "ref_logps/chosen": -21.687744140625, "ref_logps/rejected": -27.214290618896484, "rewards/accuracies": 0.6953125, "rewards/chosen": -0.5207412838935852, "rewards/margins": 0.40927091240882874, "rewards/rejected": -0.9300122261047363, "step": 342 }, { "epoch": 2.59, "learning_rate": 7.443820224719101e-08, "logps/chosen": -26.770729064941406, "logps/rejected": -37.010292053222656, "loss": 0.5614, "losses/dpo": 0.5385686159133911, "losses/sft": 1.026196002960205, "losses/total": 0.5385686159133911, "ref_logps/chosen": -21.59899139404297, "ref_logps/rejected": -27.5603084564209, "rewards/accuracies": 0.71875, "rewards/chosen": -0.5171737670898438, "rewards/margins": 0.4278249144554138, "rewards/rejected": -0.9449986815452576, "step": 343 }, { "epoch": 2.6, "learning_rate": 7.303370786516853e-08, "logps/chosen": -28.282766342163086, "logps/rejected": -37.94123840332031, "loss": 0.5574, "losses/dpo": 0.4933924973011017, "losses/sft": 1.0346543788909912, "losses/total": 0.4933924973011017, "ref_logps/chosen": -23.10584831237793, "ref_logps/rejected": -27.940820693969727, "rewards/accuracies": 0.7265625, "rewards/chosen": -0.5176920294761658, "rewards/margins": 0.48235008120536804, "rewards/rejected": -1.0000420808792114, "step": 344 }, { "epoch": 2.6, "learning_rate": 7.162921348314606e-08, "logps/chosen": -28.72567367553711, "logps/rejected": -39.72615432739258, "loss": 0.5095, "losses/dpo": 0.46729788184165955, "losses/sft": 1.0185084342956543, "losses/total": 0.46729788184165955, "ref_logps/chosen": -23.77487564086914, "ref_logps/rejected": -29.071794509887695, "rewards/accuracies": 0.8046875, "rewards/chosen": -0.4950796663761139, "rewards/margins": 0.5703563690185547, "rewards/rejected": -1.0654358863830566, "step": 345 }, { "epoch": 2.61, "learning_rate": 7.022471910112359e-08, "logps/chosen": -25.868453979492188, "logps/rejected": -38.04795455932617, "loss": 0.5312, "losses/dpo": 0.5006756782531738, "losses/sft": 0.9233719110488892, "losses/total": 0.5006756782531738, "ref_logps/chosen": -21.202287673950195, "ref_logps/rejected": -28.387357711791992, "rewards/accuracies": 0.78125, "rewards/chosen": -0.4666164517402649, "rewards/margins": 0.49944305419921875, "rewards/rejected": -0.9660595059394836, "step": 346 }, { "epoch": 2.62, "learning_rate": 6.882022471910112e-08, "logps/chosen": -30.674976348876953, "logps/rejected": -37.18801498413086, "loss": 0.5609, "losses/dpo": 0.53383469581604, "losses/sft": 1.0966167449951172, "losses/total": 0.53383469581604, "ref_logps/chosen": -25.279560089111328, "ref_logps/rejected": -27.363300323486328, "rewards/accuracies": 0.734375, "rewards/chosen": -0.5395419001579285, "rewards/margins": 0.4429297149181366, "rewards/rejected": -0.9824715852737427, "step": 347 }, { "epoch": 2.63, "learning_rate": 6.741573033707864e-08, "logps/chosen": -27.533077239990234, "logps/rejected": -36.4505729675293, "loss": 0.5459, "losses/dpo": 0.5303448438644409, "losses/sft": 1.0059340000152588, "losses/total": 0.5303448438644409, "ref_logps/chosen": -23.062650680541992, "ref_logps/rejected": -27.07624053955078, "rewards/accuracies": 0.734375, "rewards/chosen": -0.4470424950122833, "rewards/margins": 0.49039074778556824, "rewards/rejected": -0.9374332427978516, "step": 348 }, { "epoch": 2.63, "learning_rate": 6.601123595505617e-08, "logps/chosen": -27.371315002441406, "logps/rejected": -37.460845947265625, "loss": 0.5395, "losses/dpo": 0.46134790778160095, "losses/sft": 1.0326218605041504, "losses/total": 0.46134790778160095, "ref_logps/chosen": -21.61273765563965, "ref_logps/rejected": -26.598316192626953, "rewards/accuracies": 0.7265625, "rewards/chosen": -0.5758577585220337, "rewards/margins": 0.5103954076766968, "rewards/rejected": -1.0862531661987305, "step": 349 }, { "epoch": 2.64, "learning_rate": 6.460674157303371e-08, "logps/chosen": -26.683940887451172, "logps/rejected": -35.51959228515625, "loss": 0.531, "losses/dpo": 0.5929858684539795, "losses/sft": 0.8796969056129456, "losses/total": 0.5929858684539795, "ref_logps/chosen": -21.84777069091797, "ref_logps/rejected": -25.34250259399414, "rewards/accuracies": 0.7578125, "rewards/chosen": -0.48361673951148987, "rewards/margins": 0.5340923070907593, "rewards/rejected": -1.0177090167999268, "step": 350 }, { "epoch": 2.65, "learning_rate": 6.320224719101123e-08, "logps/chosen": -27.784767150878906, "logps/rejected": -36.31642150878906, "loss": 0.5638, "losses/dpo": 0.46039754152297974, "losses/sft": 1.014696478843689, "losses/total": 0.46039754152297974, "ref_logps/chosen": -22.830692291259766, "ref_logps/rejected": -26.99026107788086, "rewards/accuracies": 0.6484375, "rewards/chosen": -0.4954075217247009, "rewards/margins": 0.4372091293334961, "rewards/rejected": -0.9326165914535522, "step": 351 }, { "epoch": 2.66, "learning_rate": 6.179775280898876e-08, "logps/chosen": -23.8892765045166, "logps/rejected": -32.802425384521484, "loss": 0.5307, "losses/dpo": 0.5120245218276978, "losses/sft": 0.9590541124343872, "losses/total": 0.5120245218276978, "ref_logps/chosen": -19.922191619873047, "ref_logps/rejected": -23.754770278930664, "rewards/accuracies": 0.7890625, "rewards/chosen": -0.3967083692550659, "rewards/margins": 0.5080575346946716, "rewards/rejected": -0.9047658443450928, "step": 352 }, { "epoch": 2.66, "learning_rate": 6.039325842696629e-08, "logps/chosen": -27.902587890625, "logps/rejected": -39.759193420410156, "loss": 0.5216, "losses/dpo": 0.5157948136329651, "losses/sft": 0.8797988891601562, "losses/total": 0.5157948136329651, "ref_logps/chosen": -22.233232498168945, "ref_logps/rejected": -28.421966552734375, "rewards/accuracies": 0.796875, "rewards/chosen": -0.5669355988502502, "rewards/margins": 0.5667868852615356, "rewards/rejected": -1.1337225437164307, "step": 353 }, { "epoch": 2.67, "learning_rate": 5.898876404494382e-08, "logps/chosen": -27.200105667114258, "logps/rejected": -38.05504608154297, "loss": 0.5154, "losses/dpo": 0.6272658705711365, "losses/sft": 0.901512086391449, "losses/total": 0.6272658705711365, "ref_logps/chosen": -22.22509765625, "ref_logps/rejected": -27.425247192382812, "rewards/accuracies": 0.7421875, "rewards/chosen": -0.49750083684921265, "rewards/margins": 0.5654786825180054, "rewards/rejected": -1.0629794597625732, "step": 354 }, { "epoch": 2.68, "learning_rate": 5.758426966292135e-08, "logps/chosen": -29.658336639404297, "logps/rejected": -38.597557067871094, "loss": 0.5507, "losses/dpo": 0.4642670750617981, "losses/sft": 1.0486382246017456, "losses/total": 0.4642670750617981, "ref_logps/chosen": -23.814481735229492, "ref_logps/rejected": -27.686166763305664, "rewards/accuracies": 0.78125, "rewards/chosen": -0.5843857526779175, "rewards/margins": 0.5067534446716309, "rewards/rejected": -1.0911391973495483, "step": 355 }, { "epoch": 2.69, "learning_rate": 5.617977528089887e-08, "logps/chosen": -23.939620971679688, "logps/rejected": -33.06968688964844, "loss": 0.5631, "losses/dpo": 0.5865851640701294, "losses/sft": 1.1602400541305542, "losses/total": 0.5865851640701294, "ref_logps/chosen": -19.01073455810547, "ref_logps/rejected": -23.945262908935547, "rewards/accuracies": 0.7265625, "rewards/chosen": -0.4928884506225586, "rewards/margins": 0.4195541441440582, "rewards/rejected": -0.9124425649642944, "step": 356 }, { "epoch": 2.69, "learning_rate": 5.47752808988764e-08, "logps/chosen": -27.94991683959961, "logps/rejected": -36.65930938720703, "loss": 0.5461, "losses/dpo": 0.6400465369224548, "losses/sft": 1.0134565830230713, "losses/total": 0.6400465369224548, "ref_logps/chosen": -23.591896057128906, "ref_logps/rejected": -27.34914779663086, "rewards/accuracies": 0.7578125, "rewards/chosen": -0.43580204248428345, "rewards/margins": 0.4952143728733063, "rewards/rejected": -0.9310164451599121, "step": 357 }, { "epoch": 2.7, "learning_rate": 5.3370786516853926e-08, "logps/chosen": -28.148937225341797, "logps/rejected": -34.08583450317383, "loss": 0.561, "losses/dpo": 0.47015029191970825, "losses/sft": 0.923213005065918, "losses/total": 0.47015029191970825, "ref_logps/chosen": -23.29110336303711, "ref_logps/rejected": -25.051483154296875, "rewards/accuracies": 0.703125, "rewards/chosen": -0.4857832193374634, "rewards/margins": 0.417651891708374, "rewards/rejected": -0.9034351110458374, "step": 358 }, { "epoch": 2.71, "learning_rate": 5.196629213483146e-08, "logps/chosen": -26.931848526000977, "logps/rejected": -35.78190994262695, "loss": 0.5196, "losses/dpo": 0.4919354021549225, "losses/sft": 0.9875601530075073, "losses/total": 0.4919354021549225, "ref_logps/chosen": -22.220352172851562, "ref_logps/rejected": -25.88389778137207, "rewards/accuracies": 0.7421875, "rewards/chosen": -0.4711495637893677, "rewards/margins": 0.51865154504776, "rewards/rejected": -0.9898011684417725, "step": 359 }, { "epoch": 2.72, "learning_rate": 5.056179775280899e-08, "logps/chosen": -26.860830307006836, "logps/rejected": -36.244728088378906, "loss": 0.5141, "losses/dpo": 0.5143895745277405, "losses/sft": 0.8888437747955322, "losses/total": 0.5143895745277405, "ref_logps/chosen": -22.12276840209961, "ref_logps/rejected": -25.870086669921875, "rewards/accuracies": 0.796875, "rewards/chosen": -0.4738062918186188, "rewards/margins": 0.5636579394340515, "rewards/rejected": -1.0374642610549927, "step": 360 }, { "epoch": 2.72, "learning_rate": 4.915730337078652e-08, "logps/chosen": -27.645713806152344, "logps/rejected": -35.34681701660156, "loss": 0.5612, "losses/dpo": 0.5186240077018738, "losses/sft": 1.109127402305603, "losses/total": 0.5186240077018738, "ref_logps/chosen": -22.531267166137695, "ref_logps/rejected": -25.30887222290039, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5114448070526123, "rewards/margins": 0.4923498034477234, "rewards/rejected": -1.0037946701049805, "step": 361 }, { "epoch": 2.73, "learning_rate": 4.775280898876404e-08, "logps/chosen": -29.528343200683594, "logps/rejected": -37.111507415771484, "loss": 0.5701, "losses/dpo": 0.5167029500007629, "losses/sft": 1.1346383094787598, "losses/total": 0.5167029500007629, "ref_logps/chosen": -23.850698471069336, "ref_logps/rejected": -27.338424682617188, "rewards/accuracies": 0.703125, "rewards/chosen": -0.5677646398544312, "rewards/margins": 0.40954357385635376, "rewards/rejected": -0.9773082137107849, "step": 362 }, { "epoch": 2.74, "learning_rate": 4.634831460674157e-08, "logps/chosen": -27.099462509155273, "logps/rejected": -38.734046936035156, "loss": 0.5367, "losses/dpo": 0.6075611114501953, "losses/sft": 1.0922847986221313, "losses/total": 0.6075611114501953, "ref_logps/chosen": -21.647756576538086, "ref_logps/rejected": -27.990768432617188, "rewards/accuracies": 0.7265625, "rewards/chosen": -0.5451704859733582, "rewards/margins": 0.5291576385498047, "rewards/rejected": -1.0743281841278076, "step": 363 }, { "epoch": 2.75, "learning_rate": 4.4943820224719096e-08, "logps/chosen": -28.804433822631836, "logps/rejected": -38.87983703613281, "loss": 0.5448, "losses/dpo": 0.5679644346237183, "losses/sft": 1.123085618019104, "losses/total": 0.5679644346237183, "ref_logps/chosen": -23.055761337280273, "ref_logps/rejected": -27.842578887939453, "rewards/accuracies": 0.75, "rewards/chosen": -0.5748672485351562, "rewards/margins": 0.5288586020469666, "rewards/rejected": -1.1037259101867676, "step": 364 }, { "epoch": 2.75, "learning_rate": 4.3539325842696626e-08, "logps/chosen": -29.942031860351562, "logps/rejected": -37.742164611816406, "loss": 0.5544, "losses/dpo": 0.4389882981777191, "losses/sft": 0.9757397174835205, "losses/total": 0.4389882981777191, "ref_logps/chosen": -24.796215057373047, "ref_logps/rejected": -27.602325439453125, "rewards/accuracies": 0.6640625, "rewards/chosen": -0.5145817995071411, "rewards/margins": 0.4994018077850342, "rewards/rejected": -1.0139836072921753, "step": 365 }, { "epoch": 2.76, "learning_rate": 4.213483146067416e-08, "logps/chosen": -30.154991149902344, "logps/rejected": -35.81608581542969, "loss": 0.57, "losses/dpo": 0.571212887763977, "losses/sft": 0.8268208503723145, "losses/total": 0.571212887763977, "ref_logps/chosen": -24.421096801757812, "ref_logps/rejected": -25.88280487060547, "rewards/accuracies": 0.75, "rewards/chosen": -0.5733895897865295, "rewards/margins": 0.4199383854866028, "rewards/rejected": -0.9933279752731323, "step": 366 }, { "epoch": 2.77, "learning_rate": 4.073033707865169e-08, "logps/chosen": -27.25971794128418, "logps/rejected": -33.205955505371094, "loss": 0.5874, "losses/dpo": 0.4875527620315552, "losses/sft": 0.8703315854072571, "losses/total": 0.4875527620315552, "ref_logps/chosen": -22.749954223632812, "ref_logps/rejected": -24.781803131103516, "rewards/accuracies": 0.734375, "rewards/chosen": -0.45097634196281433, "rewards/margins": 0.3914392292499542, "rewards/rejected": -0.8424156308174133, "step": 367 }, { "epoch": 2.78, "learning_rate": 3.932584269662921e-08, "logps/chosen": -28.230928421020508, "logps/rejected": -37.86750030517578, "loss": 0.508, "losses/dpo": 0.4668968617916107, "losses/sft": 1.1078698635101318, "losses/total": 0.4668968617916107, "ref_logps/chosen": -23.454715728759766, "ref_logps/rejected": -27.212678909301758, "rewards/accuracies": 0.765625, "rewards/chosen": -0.47762107849121094, "rewards/margins": 0.5878612995147705, "rewards/rejected": -1.0654823780059814, "step": 368 }, { "epoch": 2.78, "learning_rate": 3.792134831460674e-08, "logps/chosen": -28.5417423248291, "logps/rejected": -39.07720184326172, "loss": 0.5722, "losses/dpo": 0.5119404196739197, "losses/sft": 1.0701940059661865, "losses/total": 0.5119404196739197, "ref_logps/chosen": -22.97249984741211, "ref_logps/rejected": -29.236312866210938, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5569244623184204, "rewards/margins": 0.42716455459594727, "rewards/rejected": -0.9840888977050781, "step": 369 }, { "epoch": 2.79, "learning_rate": 3.6516853932584266e-08, "logps/chosen": -24.37343406677246, "logps/rejected": -35.577354431152344, "loss": 0.5144, "losses/dpo": 0.39502987265586853, "losses/sft": 1.0756311416625977, "losses/total": 0.39502987265586853, "ref_logps/chosen": -19.630115509033203, "ref_logps/rejected": -25.026439666748047, "rewards/accuracies": 0.765625, "rewards/chosen": -0.474331796169281, "rewards/margins": 0.5807597041130066, "rewards/rejected": -1.055091381072998, "step": 370 }, { "epoch": 2.8, "learning_rate": 3.5112359550561796e-08, "logps/chosen": -25.75430679321289, "logps/rejected": -35.49622344970703, "loss": 0.5757, "losses/dpo": 0.5865879058837891, "losses/sft": 1.0159986019134521, "losses/total": 0.5865879058837891, "ref_logps/chosen": -21.11154556274414, "ref_logps/rejected": -26.56639862060547, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4642760157585144, "rewards/margins": 0.42870670557022095, "rewards/rejected": -0.8929827213287354, "step": 371 }, { "epoch": 2.81, "learning_rate": 3.370786516853932e-08, "logps/chosen": -28.015663146972656, "logps/rejected": -36.934810638427734, "loss": 0.507, "losses/dpo": 0.6495200395584106, "losses/sft": 1.097916841506958, "losses/total": 0.6495200395584106, "ref_logps/chosen": -23.48037338256836, "ref_logps/rejected": -26.453636169433594, "rewards/accuracies": 0.7734375, "rewards/chosen": -0.45352903008461, "rewards/margins": 0.5945882797241211, "rewards/rejected": -1.0481172800064087, "step": 372 }, { "epoch": 2.82, "learning_rate": 3.230337078651686e-08, "logps/chosen": -29.112939834594727, "logps/rejected": -35.190895080566406, "loss": 0.5557, "losses/dpo": 0.3998969793319702, "losses/sft": 0.9329382181167603, "losses/total": 0.3998969793319702, "ref_logps/chosen": -23.576570510864258, "ref_logps/rejected": -24.735258102416992, "rewards/accuracies": 0.75, "rewards/chosen": -0.5536371469497681, "rewards/margins": 0.49192649126052856, "rewards/rejected": -1.0455635786056519, "step": 373 }, { "epoch": 2.82, "learning_rate": 3.089887640449438e-08, "logps/chosen": -28.713830947875977, "logps/rejected": -37.038963317871094, "loss": 0.5536, "losses/dpo": 0.548796534538269, "losses/sft": 1.0410091876983643, "losses/total": 0.548796534538269, "ref_logps/chosen": -23.76250457763672, "ref_logps/rejected": -27.5520076751709, "rewards/accuracies": 0.6875, "rewards/chosen": -0.49513280391693115, "rewards/margins": 0.45356276631355286, "rewards/rejected": -0.9486956000328064, "step": 374 }, { "epoch": 2.83, "learning_rate": 2.949438202247191e-08, "logps/chosen": -29.465068817138672, "logps/rejected": -39.406578063964844, "loss": 0.5343, "losses/dpo": 0.6983579397201538, "losses/sft": 1.0986469984054565, "losses/total": 0.6983579397201538, "ref_logps/chosen": -24.418071746826172, "ref_logps/rejected": -28.89803695678711, "rewards/accuracies": 0.75, "rewards/chosen": -0.5046992897987366, "rewards/margins": 0.5461547374725342, "rewards/rejected": -1.050853967666626, "step": 375 }, { "epoch": 2.84, "learning_rate": 2.8089887640449436e-08, "logps/chosen": -27.72464370727539, "logps/rejected": -35.225887298583984, "loss": 0.5838, "losses/dpo": 0.615436851978302, "losses/sft": 1.064025640487671, "losses/total": 0.615436851978302, "ref_logps/chosen": -22.502582550048828, "ref_logps/rejected": -26.32878875732422, "rewards/accuracies": 0.7265625, "rewards/chosen": -0.5222063660621643, "rewards/margins": 0.36750373244285583, "rewards/rejected": -0.8897100687026978, "step": 376 }, { "epoch": 2.85, "learning_rate": 2.6685393258426963e-08, "logps/chosen": -26.368640899658203, "logps/rejected": -35.305564880371094, "loss": 0.5351, "losses/dpo": 0.5552591681480408, "losses/sft": 0.8796924352645874, "losses/total": 0.5552591681480408, "ref_logps/chosen": -21.371036529541016, "ref_logps/rejected": -25.149438858032227, "rewards/accuracies": 0.71875, "rewards/chosen": -0.49976038932800293, "rewards/margins": 0.5158523917198181, "rewards/rejected": -1.0156128406524658, "step": 377 }, { "epoch": 2.85, "learning_rate": 2.5280898876404493e-08, "logps/chosen": -29.838565826416016, "logps/rejected": -38.05325698852539, "loss": 0.5338, "losses/dpo": 0.4755927324295044, "losses/sft": 0.9763241410255432, "losses/total": 0.4755927324295044, "ref_logps/chosen": -24.918655395507812, "ref_logps/rejected": -28.04724884033203, "rewards/accuracies": 0.734375, "rewards/chosen": -0.4919911026954651, "rewards/margins": 0.5086094737052917, "rewards/rejected": -1.0006005764007568, "step": 378 }, { "epoch": 2.86, "learning_rate": 2.387640449438202e-08, "logps/chosen": -29.036991119384766, "logps/rejected": -35.4906005859375, "loss": 0.5335, "losses/dpo": 0.5385127067565918, "losses/sft": 1.245056390762329, "losses/total": 0.5385127067565918, "ref_logps/chosen": -23.929513931274414, "ref_logps/rejected": -24.878217697143555, "rewards/accuracies": 0.7421875, "rewards/chosen": -0.5107479095458984, "rewards/margins": 0.5504903793334961, "rewards/rejected": -1.0612382888793945, "step": 379 }, { "epoch": 2.87, "learning_rate": 2.2471910112359548e-08, "logps/chosen": -29.392702102661133, "logps/rejected": -38.68418884277344, "loss": 0.545, "losses/dpo": 0.43943360447883606, "losses/sft": 1.023887276649475, "losses/total": 0.43943360447883606, "ref_logps/chosen": -24.07533073425293, "ref_logps/rejected": -28.427806854248047, "rewards/accuracies": 0.75, "rewards/chosen": -0.5317370891571045, "rewards/margins": 0.49390077590942383, "rewards/rejected": -1.0256378650665283, "step": 380 }, { "epoch": 2.88, "learning_rate": 2.106741573033708e-08, "logps/chosen": -25.038589477539062, "logps/rejected": -32.384376525878906, "loss": 0.56, "losses/dpo": 0.6935892701148987, "losses/sft": 1.0011663436889648, "losses/total": 0.6935892701148987, "ref_logps/chosen": -21.044326782226562, "ref_logps/rejected": -23.86334991455078, "rewards/accuracies": 0.71875, "rewards/chosen": -0.3994261920452118, "rewards/margins": 0.45267629623413086, "rewards/rejected": -0.8521024584770203, "step": 381 }, { "epoch": 2.88, "learning_rate": 1.9662921348314606e-08, "logps/chosen": -30.07229995727539, "logps/rejected": -37.0654411315918, "loss": 0.5936, "losses/dpo": 0.547340989112854, "losses/sft": 1.0020110607147217, "losses/total": 0.547340989112854, "ref_logps/chosen": -23.930465698242188, "ref_logps/rejected": -27.052518844604492, "rewards/accuracies": 0.6171875, "rewards/chosen": -0.6141834259033203, "rewards/margins": 0.38710883259773254, "rewards/rejected": -1.0012922286987305, "step": 382 }, { "epoch": 2.89, "learning_rate": 1.8258426966292133e-08, "logps/chosen": -30.305606842041016, "logps/rejected": -40.710792541503906, "loss": 0.537, "losses/dpo": 0.5175353288650513, "losses/sft": 0.8916615843772888, "losses/total": 0.5175353288650513, "ref_logps/chosen": -25.279661178588867, "ref_logps/rejected": -29.970672607421875, "rewards/accuracies": 0.6796875, "rewards/chosen": -0.5025948286056519, "rewards/margins": 0.5714170932769775, "rewards/rejected": -1.0740119218826294, "step": 383 }, { "epoch": 2.9, "learning_rate": 1.685393258426966e-08, "logps/chosen": -29.87887191772461, "logps/rejected": -39.89691162109375, "loss": 0.5598, "losses/dpo": 0.4781198799610138, "losses/sft": 1.0425841808319092, "losses/total": 0.4781198799610138, "ref_logps/chosen": -23.869295120239258, "ref_logps/rejected": -29.154647827148438, "rewards/accuracies": 0.7109375, "rewards/chosen": -0.6009576916694641, "rewards/margins": 0.4732685387134552, "rewards/rejected": -1.0742262601852417, "step": 384 }, { "epoch": 2.91, "learning_rate": 1.544943820224719e-08, "logps/chosen": -26.600048065185547, "logps/rejected": -39.657188415527344, "loss": 0.5186, "losses/dpo": 0.5135948657989502, "losses/sft": 0.9224843978881836, "losses/total": 0.5135948657989502, "ref_logps/chosen": -21.754756927490234, "ref_logps/rejected": -29.07170867919922, "rewards/accuracies": 0.75, "rewards/chosen": -0.48452913761138916, "rewards/margins": 0.5740190744400024, "rewards/rejected": -1.0585482120513916, "step": 385 }, { "epoch": 2.91, "learning_rate": 1.4044943820224718e-08, "logps/chosen": -27.77488136291504, "logps/rejected": -37.88126754760742, "loss": 0.5551, "losses/dpo": 0.5367317199707031, "losses/sft": 1.0271828174591064, "losses/total": 0.5367317199707031, "ref_logps/chosen": -22.3087158203125, "ref_logps/rejected": -27.471187591552734, "rewards/accuracies": 0.7578125, "rewards/chosen": -0.5466164350509644, "rewards/margins": 0.4943912625312805, "rewards/rejected": -1.0410076379776, "step": 386 }, { "epoch": 2.92, "learning_rate": 1.2640449438202247e-08, "logps/chosen": -28.72400665283203, "logps/rejected": -36.061241149902344, "loss": 0.5438, "losses/dpo": 0.5493422746658325, "losses/sft": 0.9023943543434143, "losses/total": 0.5493422746658325, "ref_logps/chosen": -23.460235595703125, "ref_logps/rejected": -26.00853729248047, "rewards/accuracies": 0.71875, "rewards/chosen": -0.526377260684967, "rewards/margins": 0.47889336943626404, "rewards/rejected": -1.0052706003189087, "step": 387 }, { "epoch": 2.93, "learning_rate": 1.1235955056179774e-08, "logps/chosen": -27.819026947021484, "logps/rejected": -37.490928649902344, "loss": 0.5852, "losses/dpo": 0.5147813558578491, "losses/sft": 0.8766761422157288, "losses/total": 0.5147813558578491, "ref_logps/chosen": -21.90268898010254, "ref_logps/rejected": -27.36888885498047, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5916341543197632, "rewards/margins": 0.4205697774887085, "rewards/rejected": -1.0122039318084717, "step": 388 }, { "epoch": 2.94, "learning_rate": 9.831460674157303e-09, "logps/chosen": -26.303754806518555, "logps/rejected": -37.83194351196289, "loss": 0.524, "losses/dpo": 0.5489503741264343, "losses/sft": 0.9560513496398926, "losses/total": 0.5489503741264343, "ref_logps/chosen": -21.29248046875, "ref_logps/rejected": -27.238914489746094, "rewards/accuracies": 0.765625, "rewards/chosen": -0.5011276006698608, "rewards/margins": 0.5581751465797424, "rewards/rejected": -1.059302806854248, "step": 389 }, { "epoch": 2.94, "learning_rate": 8.42696629213483e-09, "logps/chosen": -26.287546157836914, "logps/rejected": -38.258975982666016, "loss": 0.5441, "losses/dpo": 0.5884628295898438, "losses/sft": 0.9961035251617432, "losses/total": 0.5884628295898438, "ref_logps/chosen": -21.199317932128906, "ref_logps/rejected": -27.755794525146484, "rewards/accuracies": 0.7265625, "rewards/chosen": -0.5088227391242981, "rewards/margins": 0.5414952635765076, "rewards/rejected": -1.0503180027008057, "step": 390 }, { "epoch": 2.95, "learning_rate": 7.022471910112359e-09, "logps/chosen": -29.260208129882812, "logps/rejected": -35.26235580444336, "loss": 0.5711, "losses/dpo": 0.6062160730361938, "losses/sft": 0.9891349673271179, "losses/total": 0.6062160730361938, "ref_logps/chosen": -24.228797912597656, "ref_logps/rejected": -25.79244613647461, "rewards/accuracies": 0.71875, "rewards/chosen": -0.5031411051750183, "rewards/margins": 0.44384992122650146, "rewards/rejected": -0.9469910264015198, "step": 391 }, { "epoch": 2.96, "learning_rate": 5.617977528089887e-09, "logps/chosen": -26.954505920410156, "logps/rejected": -38.197296142578125, "loss": 0.5188, "losses/dpo": 0.48179134726524353, "losses/sft": 1.0057315826416016, "losses/total": 0.48179134726524353, "ref_logps/chosen": -22.015995025634766, "ref_logps/rejected": -27.583335876464844, "rewards/accuracies": 0.796875, "rewards/chosen": -0.49385106563568115, "rewards/margins": 0.5675452351570129, "rewards/rejected": -1.0613962411880493, "step": 392 }, { "epoch": 2.97, "learning_rate": 4.213483146067415e-09, "logps/chosen": -25.941349029541016, "logps/rejected": -37.711891174316406, "loss": 0.5126, "losses/dpo": 0.47302547097206116, "losses/sft": 1.0042707920074463, "losses/total": 0.47302547097206116, "ref_logps/chosen": -21.42403793334961, "ref_logps/rejected": -27.090473175048828, "rewards/accuracies": 0.703125, "rewards/chosen": -0.45173099637031555, "rewards/margins": 0.610410749912262, "rewards/rejected": -1.0621416568756104, "step": 393 }, { "epoch": 2.97, "learning_rate": 2.8089887640449435e-09, "logps/chosen": -27.533342361450195, "logps/rejected": -40.14276123046875, "loss": 0.5282, "losses/dpo": 0.47439950704574585, "losses/sft": 1.004162073135376, "losses/total": 0.47439950704574585, "ref_logps/chosen": -22.44705581665039, "ref_logps/rejected": -29.271793365478516, "rewards/accuracies": 0.765625, "rewards/chosen": -0.5086286067962646, "rewards/margins": 0.5784677267074585, "rewards/rejected": -1.0870963335037231, "step": 394 }, { "epoch": 2.98, "learning_rate": 1.4044943820224717e-09, "logps/chosen": -27.44398307800293, "logps/rejected": -38.508323669433594, "loss": 0.5377, "losses/dpo": 0.5113502740859985, "losses/sft": 1.0710563659667969, "losses/total": 0.5113502740859985, "ref_logps/chosen": -22.568340301513672, "ref_logps/rejected": -28.23776626586914, "rewards/accuracies": 0.71875, "rewards/chosen": -0.4875642657279968, "rewards/margins": 0.5394913554191589, "rewards/rejected": -1.0270556211471558, "step": 395 }, { "epoch": 2.99, "learning_rate": 0.0, "logps/chosen": -28.845203399658203, "logps/rejected": -36.77953338623047, "loss": 0.5692, "losses/dpo": 0.7008877992630005, "losses/sft": 1.1200252771377563, "losses/total": 0.7008877992630005, "ref_logps/chosen": -23.59469985961914, "ref_logps/rejected": -27.211450576782227, "rewards/accuracies": 0.703125, "rewards/chosen": -0.5250504612922668, "rewards/margins": 0.4317581057548523, "rewards/rejected": -0.9568085670471191, "step": 396 }, { "epoch": 2.99, "step": 396, "total_flos": 0.0, "train_loss": 0.6152852120423558, "train_runtime": 11562.7876, "train_samples_per_second": 4.4, "train_steps_per_second": 0.034 } ], "logging_steps": 1.0, "max_steps": 396, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 200, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }