LLaVA-Hound-DPO / trainer_state.json
ruohongz's picture
Upload ./trainer_state.json with huggingface_hub
84653dd verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.988679245283019,
"eval_steps": 500,
"global_step": 396,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01,
"learning_rate": 1.25e-08,
"logps/chosen": -22.472335815429688,
"logps/rejected": -25.36812400817871,
"loss": 0.6931,
"losses/dpo": 0.6931471824645996,
"losses/sft": 0.7711470723152161,
"losses/total": 0.6931471824645996,
"ref_logps/chosen": -22.472335815429688,
"ref_logps/rejected": -25.36812400817871,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.02,
"learning_rate": 2.5e-08,
"logps/chosen": -21.278339385986328,
"logps/rejected": -25.130128860473633,
"loss": 0.6931,
"losses/dpo": 0.6931471824645996,
"losses/sft": 0.8523496985435486,
"losses/total": 0.6931471824645996,
"ref_logps/chosen": -21.278339385986328,
"ref_logps/rejected": -25.130128860473633,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 2
},
{
"epoch": 0.02,
"learning_rate": 3.75e-08,
"logps/chosen": -21.53506851196289,
"logps/rejected": -26.44188690185547,
"loss": 0.693,
"losses/dpo": 0.6928481459617615,
"losses/sft": 0.6631997227668762,
"losses/total": 0.6928481459617615,
"ref_logps/chosen": -21.54958152770996,
"ref_logps/rejected": -26.452028274536133,
"rewards/accuracies": 0.515625,
"rewards/chosen": 0.001451290212571621,
"rewards/margins": 0.00043702672701328993,
"rewards/rejected": 0.0010142631363123655,
"step": 3
},
{
"epoch": 0.03,
"learning_rate": 5e-08,
"logps/chosen": -21.846920013427734,
"logps/rejected": -26.232192993164062,
"loss": 0.6935,
"losses/dpo": 0.6933612823486328,
"losses/sft": 0.819932758808136,
"losses/total": 0.6933612823486328,
"ref_logps/chosen": -21.842269897460938,
"ref_logps/rejected": -26.234174728393555,
"rewards/accuracies": 0.4609375,
"rewards/chosen": -0.00046504498459398746,
"rewards/margins": -0.000663207727484405,
"rewards/rejected": 0.0001981628010980785,
"step": 4
},
{
"epoch": 0.04,
"learning_rate": 6.25e-08,
"logps/chosen": -23.82025146484375,
"logps/rejected": -26.558738708496094,
"loss": 0.693,
"losses/dpo": 0.6929464340209961,
"losses/sft": 0.7624120712280273,
"losses/total": 0.6929464340209961,
"ref_logps/chosen": -23.817665100097656,
"ref_logps/rejected": -26.55132293701172,
"rewards/accuracies": 0.5234375,
"rewards/chosen": -0.00025857496075332165,
"rewards/margins": 0.0004831284750252962,
"rewards/rejected": -0.0007417035521939397,
"step": 5
},
{
"epoch": 0.05,
"learning_rate": 7.5e-08,
"logps/chosen": -25.088871002197266,
"logps/rejected": -29.653806686401367,
"loss": 0.6923,
"losses/dpo": 0.6934427618980408,
"losses/sft": 0.7273141741752625,
"losses/total": 0.6934427618980408,
"ref_logps/chosen": -25.0992431640625,
"ref_logps/rejected": -29.64551544189453,
"rewards/accuracies": 0.6015625,
"rewards/chosen": 0.0010370061499997973,
"rewards/margins": 0.0018662326037883759,
"rewards/rejected": -0.0008292265702039003,
"step": 6
},
{
"epoch": 0.05,
"learning_rate": 8.75e-08,
"logps/chosen": -23.075027465820312,
"logps/rejected": -27.50556182861328,
"loss": 0.693,
"losses/dpo": 0.6948896646499634,
"losses/sft": 0.6432714462280273,
"losses/total": 0.6948896646499634,
"ref_logps/chosen": -23.066946029663086,
"ref_logps/rejected": -27.4930362701416,
"rewards/accuracies": 0.4765625,
"rewards/chosen": -0.0008082209387794137,
"rewards/margins": 0.0004443599027581513,
"rewards/rejected": -0.001252580899745226,
"step": 7
},
{
"epoch": 0.06,
"learning_rate": 1e-07,
"logps/chosen": -21.430335998535156,
"logps/rejected": -29.949260711669922,
"loss": 0.6933,
"losses/dpo": 0.6911635398864746,
"losses/sft": 0.8042243123054504,
"losses/total": 0.6911635398864746,
"ref_logps/chosen": -21.44394302368164,
"ref_logps/rejected": -29.96406364440918,
"rewards/accuracies": 0.4921875,
"rewards/chosen": 0.0013606649590656161,
"rewards/margins": -0.00011985772289335728,
"rewards/rejected": 0.0014805227983742952,
"step": 8
},
{
"epoch": 0.07,
"learning_rate": 1.125e-07,
"logps/chosen": -23.053390502929688,
"logps/rejected": -27.866111755371094,
"loss": 0.6923,
"losses/dpo": 0.6914368271827698,
"losses/sft": 0.8787165284156799,
"losses/total": 0.6914368271827698,
"ref_logps/chosen": -23.060134887695312,
"ref_logps/rejected": -27.85537338256836,
"rewards/accuracies": 0.5546875,
"rewards/chosen": 0.0006745259161107242,
"rewards/margins": 0.0017485294956713915,
"rewards/rejected": -0.0010740034049376845,
"step": 9
},
{
"epoch": 0.08,
"learning_rate": 1.25e-07,
"logps/chosen": -23.637466430664062,
"logps/rejected": -29.587308883666992,
"loss": 0.6922,
"losses/dpo": 0.690066397190094,
"losses/sft": 1.0419297218322754,
"losses/total": 0.690066397190094,
"ref_logps/chosen": -23.649028778076172,
"ref_logps/rejected": -29.579374313354492,
"rewards/accuracies": 0.546875,
"rewards/chosen": 0.0011563875013962388,
"rewards/margins": 0.0019498697947710752,
"rewards/rejected": -0.0007934823515824974,
"step": 10
},
{
"epoch": 0.08,
"learning_rate": 1.375e-07,
"logps/chosen": -22.38899040222168,
"logps/rejected": -24.971160888671875,
"loss": 0.6926,
"losses/dpo": 0.6951523423194885,
"losses/sft": 0.9443475008010864,
"losses/total": 0.6951523423194885,
"ref_logps/chosen": -22.398780822753906,
"ref_logps/rejected": -24.969751358032227,
"rewards/accuracies": 0.5078125,
"rewards/chosen": 0.000978996278718114,
"rewards/margins": 0.0011200353037565947,
"rewards/rejected": -0.00014103890862315893,
"step": 11
},
{
"epoch": 0.09,
"learning_rate": 1.5e-07,
"logps/chosen": -20.165252685546875,
"logps/rejected": -26.619457244873047,
"loss": 0.6946,
"losses/dpo": 0.6987805962562561,
"losses/sft": 0.876471221446991,
"losses/total": 0.6987805962562561,
"ref_logps/chosen": -20.14897918701172,
"ref_logps/rejected": -26.63131332397461,
"rewards/accuracies": 0.421875,
"rewards/chosen": -0.001627539866603911,
"rewards/margins": -0.002813115483149886,
"rewards/rejected": 0.0011855755001306534,
"step": 12
},
{
"epoch": 0.1,
"learning_rate": 1.625e-07,
"logps/chosen": -25.07573699951172,
"logps/rejected": -25.939855575561523,
"loss": 0.6936,
"losses/dpo": 0.6952416896820068,
"losses/sft": 0.9322817325592041,
"losses/total": 0.6952416896820068,
"ref_logps/chosen": -25.075220108032227,
"ref_logps/rejected": -25.947521209716797,
"rewards/accuracies": 0.515625,
"rewards/chosen": -5.185510963201523e-05,
"rewards/margins": -0.0008183673489838839,
"rewards/rejected": 0.0007665121229365468,
"step": 13
},
{
"epoch": 0.11,
"learning_rate": 1.75e-07,
"logps/chosen": -22.58213233947754,
"logps/rejected": -27.590843200683594,
"loss": 0.692,
"losses/dpo": 0.6901522874832153,
"losses/sft": 0.8234641551971436,
"losses/total": 0.6901522874832153,
"ref_logps/chosen": -22.58617401123047,
"ref_logps/rejected": -27.570602416992188,
"rewards/accuracies": 0.578125,
"rewards/chosen": 0.00040407240157946944,
"rewards/margins": 0.002427991945296526,
"rewards/rejected": -0.0020239197183400393,
"step": 14
},
{
"epoch": 0.11,
"learning_rate": 1.875e-07,
"logps/chosen": -23.004196166992188,
"logps/rejected": -25.858173370361328,
"loss": 0.6925,
"losses/dpo": 0.6923660039901733,
"losses/sft": 0.7345502376556396,
"losses/total": 0.6923660039901733,
"ref_logps/chosen": -23.010601043701172,
"ref_logps/rejected": -25.85067367553711,
"rewards/accuracies": 0.515625,
"rewards/chosen": 0.0006403709994629025,
"rewards/margins": 0.0013903947547078133,
"rewards/rejected": -0.0007500239298678935,
"step": 15
},
{
"epoch": 0.12,
"learning_rate": 2e-07,
"logps/chosen": -21.546062469482422,
"logps/rejected": -25.777360916137695,
"loss": 0.6931,
"losses/dpo": 0.6901232004165649,
"losses/sft": 0.8039647936820984,
"losses/total": 0.6901232004165649,
"ref_logps/chosen": -21.53840446472168,
"ref_logps/rejected": -25.766767501831055,
"rewards/accuracies": 0.5390625,
"rewards/chosen": -0.0007656853413209319,
"rewards/margins": 0.0002937153331004083,
"rewards/rejected": -0.0010594006162136793,
"step": 16
},
{
"epoch": 0.13,
"learning_rate": 2.1249999999999998e-07,
"logps/chosen": -22.206989288330078,
"logps/rejected": -27.877731323242188,
"loss": 0.6937,
"losses/dpo": 0.6932737827301025,
"losses/sft": 0.7667961716651917,
"losses/total": 0.6932737827301025,
"ref_logps/chosen": -22.19771957397461,
"ref_logps/rejected": -27.87958335876465,
"rewards/accuracies": 0.4453125,
"rewards/chosen": -0.0009270801674574614,
"rewards/margins": -0.0011123311705887318,
"rewards/rejected": 0.0001852509449236095,
"step": 17
},
{
"epoch": 0.14,
"learning_rate": 2.25e-07,
"logps/chosen": -21.215139389038086,
"logps/rejected": -25.75381088256836,
"loss": 0.693,
"losses/dpo": 0.6932240724563599,
"losses/sft": 0.736687421798706,
"losses/total": 0.6932240724563599,
"ref_logps/chosen": -21.212387084960938,
"ref_logps/rejected": -25.746326446533203,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.00027507508639246225,
"rewards/margins": 0.000473553518531844,
"rewards/rejected": -0.0007486287504434586,
"step": 18
},
{
"epoch": 0.14,
"learning_rate": 2.3749999999999998e-07,
"logps/chosen": -22.499832153320312,
"logps/rejected": -26.145751953125,
"loss": 0.6932,
"losses/dpo": 0.6942628622055054,
"losses/sft": 0.7466978430747986,
"losses/total": 0.6942628622055054,
"ref_logps/chosen": -22.496463775634766,
"ref_logps/rejected": -26.141849517822266,
"rewards/accuracies": 0.484375,
"rewards/chosen": -0.0003368390607647598,
"rewards/margins": 5.3280091378837824e-05,
"rewards/rejected": -0.0003901191521435976,
"step": 19
},
{
"epoch": 0.15,
"learning_rate": 2.5e-07,
"logps/chosen": -21.5505428314209,
"logps/rejected": -25.036113739013672,
"loss": 0.693,
"losses/dpo": 0.688271164894104,
"losses/sft": 0.8725596070289612,
"losses/total": 0.688271164894104,
"ref_logps/chosen": -21.558109283447266,
"ref_logps/rejected": -25.038726806640625,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.0007566105341538787,
"rewards/margins": 0.0004949538852088153,
"rewards/rejected": 0.0002616568235680461,
"step": 20
},
{
"epoch": 0.16,
"learning_rate": 2.625e-07,
"logps/chosen": -21.649169921875,
"logps/rejected": -25.741392135620117,
"loss": 0.6917,
"losses/dpo": 0.6939514875411987,
"losses/sft": 0.7525328993797302,
"losses/total": 0.6939514875411987,
"ref_logps/chosen": -21.666126251220703,
"ref_logps/rejected": -25.72817611694336,
"rewards/accuracies": 0.640625,
"rewards/chosen": 0.0016953760059550405,
"rewards/margins": 0.003016936592757702,
"rewards/rejected": -0.0013215603539720178,
"step": 21
},
{
"epoch": 0.17,
"learning_rate": 2.75e-07,
"logps/chosen": -21.422496795654297,
"logps/rejected": -26.453773498535156,
"loss": 0.695,
"losses/dpo": 0.699163019657135,
"losses/sft": 0.7248706221580505,
"losses/total": 0.699163019657135,
"ref_logps/chosen": -21.396032333374023,
"ref_logps/rejected": -26.464006423950195,
"rewards/accuracies": 0.4296875,
"rewards/chosen": -0.002646287204697728,
"rewards/margins": -0.003669553902000189,
"rewards/rejected": 0.0010232668137177825,
"step": 22
},
{
"epoch": 0.17,
"learning_rate": 2.8749999999999995e-07,
"logps/chosen": -21.21988868713379,
"logps/rejected": -25.13469886779785,
"loss": 0.6929,
"losses/dpo": 0.6908746957778931,
"losses/sft": 0.7899657487869263,
"losses/total": 0.6908746957778931,
"ref_logps/chosen": -21.22311782836914,
"ref_logps/rejected": -25.131580352783203,
"rewards/accuracies": 0.515625,
"rewards/chosen": 0.0003229643334634602,
"rewards/margins": 0.0006348754977807403,
"rewards/rejected": -0.00031191116431728005,
"step": 23
},
{
"epoch": 0.18,
"learning_rate": 3e-07,
"logps/chosen": -24.172225952148438,
"logps/rejected": -27.93877410888672,
"loss": 0.6936,
"losses/dpo": 0.6931849718093872,
"losses/sft": 0.7270597219467163,
"losses/total": 0.6931849718093872,
"ref_logps/chosen": -24.16461944580078,
"ref_logps/rejected": -27.940391540527344,
"rewards/accuracies": 0.4765625,
"rewards/chosen": -0.000760397466365248,
"rewards/margins": -0.0009220357751473784,
"rewards/rejected": 0.00016163833788596094,
"step": 24
},
{
"epoch": 0.19,
"learning_rate": 3.1249999999999997e-07,
"logps/chosen": -23.023677825927734,
"logps/rejected": -23.77918243408203,
"loss": 0.6929,
"losses/dpo": 0.6930486559867859,
"losses/sft": 0.779391884803772,
"losses/total": 0.6930486559867859,
"ref_logps/chosen": -23.028684616088867,
"ref_logps/rejected": -23.778560638427734,
"rewards/accuracies": 0.546875,
"rewards/chosen": 0.0005005812272429466,
"rewards/margins": 0.0005629429360851645,
"rewards/rejected": -6.23615924268961e-05,
"step": 25
},
{
"epoch": 0.2,
"learning_rate": 3.25e-07,
"logps/chosen": -24.240978240966797,
"logps/rejected": -30.183570861816406,
"loss": 0.6923,
"losses/dpo": 0.6919558644294739,
"losses/sft": 0.8828473091125488,
"losses/total": 0.6919558644294739,
"ref_logps/chosen": -24.253870010375977,
"ref_logps/rejected": -30.17804718017578,
"rewards/accuracies": 0.5078125,
"rewards/chosen": 0.001289202249608934,
"rewards/margins": 0.0018417320679873228,
"rewards/rejected": -0.0005525298183783889,
"step": 26
},
{
"epoch": 0.2,
"learning_rate": 3.375e-07,
"logps/chosen": -22.371261596679688,
"logps/rejected": -28.10503387451172,
"loss": 0.6911,
"losses/dpo": 0.6919010281562805,
"losses/sft": 0.9361266493797302,
"losses/total": 0.6919010281562805,
"ref_logps/chosen": -22.4020938873291,
"ref_logps/rejected": -28.094257354736328,
"rewards/accuracies": 0.59375,
"rewards/chosen": 0.0030831946060061455,
"rewards/margins": 0.004160974640399218,
"rewards/rejected": -0.001077780150808394,
"step": 27
},
{
"epoch": 0.21,
"learning_rate": 3.5e-07,
"logps/chosen": -21.107967376708984,
"logps/rejected": -27.053752899169922,
"loss": 0.6921,
"losses/dpo": 0.6916664838790894,
"losses/sft": 0.8491181135177612,
"losses/total": 0.6916664838790894,
"ref_logps/chosen": -21.1080379486084,
"ref_logps/rejected": -27.03229331970215,
"rewards/accuracies": 0.5390625,
"rewards/chosen": 7.087946869432926e-06,
"rewards/margins": 0.0021530785597860813,
"rewards/rejected": -0.00214599072933197,
"step": 28
},
{
"epoch": 0.22,
"learning_rate": 3.6249999999999997e-07,
"logps/chosen": -23.424461364746094,
"logps/rejected": -27.092483520507812,
"loss": 0.6906,
"losses/dpo": 0.6926239728927612,
"losses/sft": 0.7789149284362793,
"losses/total": 0.6926239728927612,
"ref_logps/chosen": -23.46218490600586,
"ref_logps/rejected": -27.07909393310547,
"rewards/accuracies": 0.59375,
"rewards/chosen": 0.0037725979927927256,
"rewards/margins": 0.005111560225486755,
"rewards/rejected": -0.0013389625819399953,
"step": 29
},
{
"epoch": 0.23,
"learning_rate": 3.75e-07,
"logps/chosen": -22.859556198120117,
"logps/rejected": -27.201662063598633,
"loss": 0.6933,
"losses/dpo": 0.6948127746582031,
"losses/sft": 0.7969105243682861,
"losses/total": 0.6948127746582031,
"ref_logps/chosen": -22.869096755981445,
"ref_logps/rejected": -27.212430953979492,
"rewards/accuracies": 0.515625,
"rewards/chosen": 0.0009542852640151978,
"rewards/margins": -0.00012272456660866737,
"rewards/rejected": 0.001077009947039187,
"step": 30
},
{
"epoch": 0.23,
"learning_rate": 3.875e-07,
"logps/chosen": -22.666168212890625,
"logps/rejected": -25.310596466064453,
"loss": 0.6918,
"losses/dpo": 0.6922581195831299,
"losses/sft": 0.7759775519371033,
"losses/total": 0.6922581195831299,
"ref_logps/chosen": -22.68026351928711,
"ref_logps/rejected": -25.297521591186523,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.001409594900906086,
"rewards/margins": 0.002717201365157962,
"rewards/rejected": -0.001307606347836554,
"step": 31
},
{
"epoch": 0.24,
"learning_rate": 4e-07,
"logps/chosen": -23.281084060668945,
"logps/rejected": -28.84569549560547,
"loss": 0.693,
"losses/dpo": 0.6980300545692444,
"losses/sft": 0.7636886835098267,
"losses/total": 0.6980300545692444,
"ref_logps/chosen": -23.299869537353516,
"ref_logps/rejected": -28.859834671020508,
"rewards/accuracies": 0.5390625,
"rewards/chosen": 0.0018782642437145114,
"rewards/margins": 0.000464284501504153,
"rewards/rejected": 0.0014139798004180193,
"step": 32
},
{
"epoch": 0.25,
"learning_rate": 4.1249999999999997e-07,
"logps/chosen": -20.922544479370117,
"logps/rejected": -27.139453887939453,
"loss": 0.6914,
"losses/dpo": 0.6892759799957275,
"losses/sft": 0.7832686901092529,
"losses/total": 0.6892759799957275,
"ref_logps/chosen": -20.949806213378906,
"ref_logps/rejected": -27.13178253173828,
"rewards/accuracies": 0.6484375,
"rewards/chosen": 0.0027261325158178806,
"rewards/margins": 0.003493295982480049,
"rewards/rejected": -0.0007671635248698294,
"step": 33
},
{
"epoch": 0.26,
"learning_rate": 4.2499999999999995e-07,
"logps/chosen": -22.535436630249023,
"logps/rejected": -26.6143798828125,
"loss": 0.6926,
"losses/dpo": 0.6938276290893555,
"losses/sft": 0.7895969152450562,
"losses/total": 0.6938276290893555,
"ref_logps/chosen": -22.540180206298828,
"ref_logps/rejected": -26.607288360595703,
"rewards/accuracies": 0.53125,
"rewards/chosen": 0.0004745282931253314,
"rewards/margins": 0.0011834825854748487,
"rewards/rejected": -0.0007089540013112128,
"step": 34
},
{
"epoch": 0.26,
"learning_rate": 4.375e-07,
"logps/chosen": -21.444934844970703,
"logps/rejected": -27.329378128051758,
"loss": 0.6928,
"losses/dpo": 0.6910836100578308,
"losses/sft": 0.7998620271682739,
"losses/total": 0.6910836100578308,
"ref_logps/chosen": -21.460729598999023,
"ref_logps/rejected": -27.336944580078125,
"rewards/accuracies": 0.515625,
"rewards/chosen": 0.0015797324012964964,
"rewards/margins": 0.0008232423570007086,
"rewards/rejected": 0.0007564900442957878,
"step": 35
},
{
"epoch": 0.27,
"learning_rate": 4.5e-07,
"logps/chosen": -22.847640991210938,
"logps/rejected": -26.22686195373535,
"loss": 0.6938,
"losses/dpo": 0.6915764808654785,
"losses/sft": 0.7927474975585938,
"losses/total": 0.6915764808654785,
"ref_logps/chosen": -22.84987449645996,
"ref_logps/rejected": -26.240699768066406,
"rewards/accuracies": 0.4375,
"rewards/chosen": 0.00022311191423796117,
"rewards/margins": -0.001160716055892408,
"rewards/rejected": 0.0013838279992341995,
"step": 36
},
{
"epoch": 0.28,
"learning_rate": 4.625e-07,
"logps/chosen": -23.097599029541016,
"logps/rejected": -25.179964065551758,
"loss": 0.6925,
"losses/dpo": 0.6903287768363953,
"losses/sft": 0.8005999326705933,
"losses/total": 0.6903287768363953,
"ref_logps/chosen": -23.103515625,
"ref_logps/rejected": -25.171833038330078,
"rewards/accuracies": 0.4921875,
"rewards/chosen": 0.000591703865211457,
"rewards/margins": 0.001405149232596159,
"rewards/rejected": -0.000813445309177041,
"step": 37
},
{
"epoch": 0.29,
"learning_rate": 4.7499999999999995e-07,
"logps/chosen": -23.07529640197754,
"logps/rejected": -26.14615821838379,
"loss": 0.6912,
"losses/dpo": 0.6978300213813782,
"losses/sft": 0.7380209565162659,
"losses/total": 0.6978300213813782,
"ref_logps/chosen": -23.113910675048828,
"ref_logps/rejected": -26.144914627075195,
"rewards/accuracies": 0.578125,
"rewards/chosen": 0.003861566074192524,
"rewards/margins": 0.003985891118645668,
"rewards/rejected": -0.00012432527728378773,
"step": 38
},
{
"epoch": 0.29,
"learning_rate": 4.875e-07,
"logps/chosen": -23.091575622558594,
"logps/rejected": -28.207073211669922,
"loss": 0.6933,
"losses/dpo": 0.6926023960113525,
"losses/sft": 0.7966833710670471,
"losses/total": 0.6926023960113525,
"ref_logps/chosen": -23.102182388305664,
"ref_logps/rejected": -28.21949577331543,
"rewards/accuracies": 0.5234375,
"rewards/chosen": 0.0010607184376567602,
"rewards/margins": -0.00018126872600987554,
"rewards/rejected": 0.0012419875711202621,
"step": 39
},
{
"epoch": 0.3,
"learning_rate": 5e-07,
"logps/chosen": -21.683151245117188,
"logps/rejected": -27.111900329589844,
"loss": 0.6903,
"losses/dpo": 0.6866278648376465,
"losses/sft": 0.887488842010498,
"losses/total": 0.6866278648376465,
"ref_logps/chosen": -21.714126586914062,
"ref_logps/rejected": -27.08427619934082,
"rewards/accuracies": 0.609375,
"rewards/chosen": 0.003097555134445429,
"rewards/margins": 0.0058600143529474735,
"rewards/rejected": -0.0027624592185020447,
"step": 40
},
{
"epoch": 0.31,
"learning_rate": 4.985955056179775e-07,
"logps/chosen": -23.24443817138672,
"logps/rejected": -24.057823181152344,
"loss": 0.6924,
"losses/dpo": 0.6903232336044312,
"losses/sft": 0.7454457879066467,
"losses/total": 0.6903232336044312,
"ref_logps/chosen": -23.264373779296875,
"ref_logps/rejected": -24.060710906982422,
"rewards/accuracies": 0.4765625,
"rewards/chosen": 0.0019934140145778656,
"rewards/margins": 0.0017045673448592424,
"rewards/rejected": 0.0002888469025492668,
"step": 41
},
{
"epoch": 0.32,
"learning_rate": 4.97191011235955e-07,
"logps/chosen": -22.751291275024414,
"logps/rejected": -23.993690490722656,
"loss": 0.692,
"losses/dpo": 0.6917561292648315,
"losses/sft": 0.8527467846870422,
"losses/total": 0.6917561292648315,
"ref_logps/chosen": -22.75712013244629,
"ref_logps/rejected": -23.975711822509766,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.0005830166628584266,
"rewards/margins": 0.0023808996193110943,
"rewards/rejected": -0.0017978833056986332,
"step": 42
},
{
"epoch": 0.32,
"learning_rate": 4.957865168539325e-07,
"logps/chosen": -24.575613021850586,
"logps/rejected": -27.22784996032715,
"loss": 0.6922,
"losses/dpo": 0.6912024021148682,
"losses/sft": 0.8869270086288452,
"losses/total": 0.6912024021148682,
"ref_logps/chosen": -24.60643196105957,
"ref_logps/rejected": -27.23748779296875,
"rewards/accuracies": 0.5234375,
"rewards/chosen": 0.0030817545484751463,
"rewards/margins": 0.002117899712175131,
"rewards/rejected": 0.0009638546616770327,
"step": 43
},
{
"epoch": 0.33,
"learning_rate": 4.943820224719101e-07,
"logps/chosen": -23.449739456176758,
"logps/rejected": -29.683177947998047,
"loss": 0.6913,
"losses/dpo": 0.690817654132843,
"losses/sft": 0.7518939971923828,
"losses/total": 0.690817654132843,
"ref_logps/chosen": -23.47886848449707,
"ref_logps/rejected": -29.673599243164062,
"rewards/accuracies": 0.5546875,
"rewards/chosen": 0.0029130401089787483,
"rewards/margins": 0.0038708222564309835,
"rewards/rejected": -0.0009577819146215916,
"step": 44
},
{
"epoch": 0.34,
"learning_rate": 4.929775280898877e-07,
"logps/chosen": -21.53199577331543,
"logps/rejected": -26.939178466796875,
"loss": 0.6923,
"losses/dpo": 0.6913425922393799,
"losses/sft": 0.6940815448760986,
"losses/total": 0.6913425922393799,
"ref_logps/chosen": -21.567256927490234,
"ref_logps/rejected": -26.955793380737305,
"rewards/accuracies": 0.5078125,
"rewards/chosen": 0.0035258703865110874,
"rewards/margins": 0.0018642698414623737,
"rewards/rejected": 0.0016616008942946792,
"step": 45
},
{
"epoch": 0.35,
"learning_rate": 4.915730337078651e-07,
"logps/chosen": -25.476314544677734,
"logps/rejected": -28.62994956970215,
"loss": 0.6903,
"losses/dpo": 0.6909126043319702,
"losses/sft": 0.9766503572463989,
"losses/total": 0.6909126043319702,
"ref_logps/chosen": -25.53481674194336,
"ref_logps/rejected": -28.629175186157227,
"rewards/accuracies": 0.5703125,
"rewards/chosen": 0.005850302986800671,
"rewards/margins": 0.005927846767008305,
"rewards/rejected": -7.754407124593854e-05,
"step": 46
},
{
"epoch": 0.35,
"learning_rate": 4.901685393258427e-07,
"logps/chosen": -24.225303649902344,
"logps/rejected": -25.871768951416016,
"loss": 0.6919,
"losses/dpo": 0.6921358704566956,
"losses/sft": 0.8468361496925354,
"losses/total": 0.6921358704566956,
"ref_logps/chosen": -24.27114486694336,
"ref_logps/rejected": -25.892141342163086,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.004584114067256451,
"rewards/margins": 0.0025467565283179283,
"rewards/rejected": 0.0020373575389385223,
"step": 47
},
{
"epoch": 0.36,
"learning_rate": 4.887640449438202e-07,
"logps/chosen": -21.586902618408203,
"logps/rejected": -27.604530334472656,
"loss": 0.6902,
"losses/dpo": 0.6886686086654663,
"losses/sft": 0.7169030904769897,
"losses/total": 0.6886686086654663,
"ref_logps/chosen": -21.621915817260742,
"ref_logps/rejected": -27.578147888183594,
"rewards/accuracies": 0.5703125,
"rewards/chosen": 0.003501205239444971,
"rewards/margins": 0.006139571778476238,
"rewards/rejected": -0.00263836607336998,
"step": 48
},
{
"epoch": 0.37,
"learning_rate": 4.873595505617978e-07,
"logps/chosen": -24.600910186767578,
"logps/rejected": -30.10862922668457,
"loss": 0.6909,
"losses/dpo": 0.6952996850013733,
"losses/sft": 0.7813842296600342,
"losses/total": 0.6952996850013733,
"ref_logps/chosen": -24.644847869873047,
"ref_logps/rejected": -30.104995727539062,
"rewards/accuracies": 0.5859375,
"rewards/chosen": 0.004394051153212786,
"rewards/margins": 0.004757395945489407,
"rewards/rejected": -0.0003633448213804513,
"step": 49
},
{
"epoch": 0.38,
"learning_rate": 4.859550561797752e-07,
"logps/chosen": -20.754308700561523,
"logps/rejected": -24.876815795898438,
"loss": 0.6914,
"losses/dpo": 0.6889626979827881,
"losses/sft": 0.8148602843284607,
"losses/total": 0.6889626979827881,
"ref_logps/chosen": -20.81591796875,
"ref_logps/rejected": -24.90121078491211,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.006161023862659931,
"rewards/margins": 0.0037216043565422297,
"rewards/rejected": 0.0024394195061177015,
"step": 50
},
{
"epoch": 0.38,
"learning_rate": 4.845505617977528e-07,
"logps/chosen": -23.585115432739258,
"logps/rejected": -24.949783325195312,
"loss": 0.6915,
"losses/dpo": 0.6886854767799377,
"losses/sft": 0.8582803010940552,
"losses/total": 0.6886854767799377,
"ref_logps/chosen": -23.63630485534668,
"ref_logps/rejected": -24.965686798095703,
"rewards/accuracies": 0.5546875,
"rewards/chosen": 0.00511885154992342,
"rewards/margins": 0.00352850160561502,
"rewards/rejected": 0.0015903504099696875,
"step": 51
},
{
"epoch": 0.39,
"learning_rate": 4.831460674157303e-07,
"logps/chosen": -20.576318740844727,
"logps/rejected": -24.87842559814453,
"loss": 0.6916,
"losses/dpo": 0.6899633407592773,
"losses/sft": 0.6870510578155518,
"losses/total": 0.6899633407592773,
"ref_logps/chosen": -20.60286521911621,
"ref_logps/rejected": -24.87299346923828,
"rewards/accuracies": 0.5546875,
"rewards/chosen": 0.0026545142754912376,
"rewards/margins": 0.0031977419275790453,
"rewards/rejected": -0.0005432275356724858,
"step": 52
},
{
"epoch": 0.4,
"learning_rate": 4.817415730337078e-07,
"logps/chosen": -24.051544189453125,
"logps/rejected": -25.128353118896484,
"loss": 0.6887,
"losses/dpo": 0.6841185092926025,
"losses/sft": 0.833280622959137,
"losses/total": 0.6841185092926025,
"ref_logps/chosen": -24.10638427734375,
"ref_logps/rejected": -25.090627670288086,
"rewards/accuracies": 0.6171875,
"rewards/chosen": 0.005483907647430897,
"rewards/margins": 0.009256447665393353,
"rewards/rejected": -0.003772540483623743,
"step": 53
},
{
"epoch": 0.41,
"learning_rate": 4.803370786516854e-07,
"logps/chosen": -21.564958572387695,
"logps/rejected": -26.20134735107422,
"loss": 0.6914,
"losses/dpo": 0.687272846698761,
"losses/sft": 0.7218018770217896,
"losses/total": 0.687272846698761,
"ref_logps/chosen": -21.621246337890625,
"ref_logps/rejected": -26.221445083618164,
"rewards/accuracies": 0.546875,
"rewards/chosen": 0.00562882237136364,
"rewards/margins": 0.003618879709392786,
"rewards/rejected": 0.002009942661970854,
"step": 54
},
{
"epoch": 0.42,
"learning_rate": 4.789325842696629e-07,
"logps/chosen": -23.699432373046875,
"logps/rejected": -26.1567325592041,
"loss": 0.6884,
"losses/dpo": 0.6862033605575562,
"losses/sft": 0.9426325559616089,
"losses/total": 0.6862033605575562,
"ref_logps/chosen": -23.775989532470703,
"ref_logps/rejected": -26.134971618652344,
"rewards/accuracies": 0.5859375,
"rewards/chosen": 0.007655493449419737,
"rewards/margins": 0.009831697680056095,
"rewards/rejected": -0.0021762042306363583,
"step": 55
},
{
"epoch": 0.42,
"learning_rate": 4.775280898876405e-07,
"logps/chosen": -23.076374053955078,
"logps/rejected": -27.695213317871094,
"loss": 0.6881,
"losses/dpo": 0.6900283098220825,
"losses/sft": 0.8505688905715942,
"losses/total": 0.6900283098220825,
"ref_logps/chosen": -23.134254455566406,
"ref_logps/rejected": -27.64853858947754,
"rewards/accuracies": 0.6640625,
"rewards/chosen": 0.005787987262010574,
"rewards/margins": 0.010455346666276455,
"rewards/rejected": -0.004667359404265881,
"step": 56
},
{
"epoch": 0.43,
"learning_rate": 4.7612359550561797e-07,
"logps/chosen": -21.54006576538086,
"logps/rejected": -24.36727523803711,
"loss": 0.6911,
"losses/dpo": 0.6942879557609558,
"losses/sft": 0.7311047911643982,
"losses/total": 0.6942879557609558,
"ref_logps/chosen": -21.592029571533203,
"ref_logps/rejected": -24.37733268737793,
"rewards/accuracies": 0.6015625,
"rewards/chosen": 0.0051962630823254585,
"rewards/margins": 0.004190489184111357,
"rewards/rejected": 0.0010057740146294236,
"step": 57
},
{
"epoch": 0.44,
"learning_rate": 4.747191011235955e-07,
"logps/chosen": -21.678865432739258,
"logps/rejected": -28.501548767089844,
"loss": 0.6906,
"losses/dpo": 0.6889323592185974,
"losses/sft": 0.7590615749359131,
"losses/total": 0.6889323592185974,
"ref_logps/chosen": -21.72535514831543,
"ref_logps/rejected": -28.493976593017578,
"rewards/accuracies": 0.6328125,
"rewards/chosen": 0.004648969508707523,
"rewards/margins": 0.005405961070209742,
"rewards/rejected": -0.0007569912704639137,
"step": 58
},
{
"epoch": 0.45,
"learning_rate": 4.7331460674157303e-07,
"logps/chosen": -23.9781436920166,
"logps/rejected": -26.515047073364258,
"loss": 0.684,
"losses/dpo": 0.6820257902145386,
"losses/sft": 0.8394409418106079,
"losses/total": 0.6820257902145386,
"ref_logps/chosen": -24.077434539794922,
"ref_logps/rejected": -26.427589416503906,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.009928906336426735,
"rewards/margins": 0.01867445930838585,
"rewards/rejected": -0.008745552971959114,
"step": 59
},
{
"epoch": 0.45,
"learning_rate": 4.7191011235955054e-07,
"logps/chosen": -22.162433624267578,
"logps/rejected": -30.391559600830078,
"loss": 0.6894,
"losses/dpo": 0.6909818053245544,
"losses/sft": 0.7433596253395081,
"losses/total": 0.6909818053245544,
"ref_logps/chosen": -22.250009536743164,
"ref_logps/rejected": -30.40111541748047,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.00875765923410654,
"rewards/margins": 0.007802051026374102,
"rewards/rejected": 0.0009556080331094563,
"step": 60
},
{
"epoch": 0.46,
"learning_rate": 4.705056179775281e-07,
"logps/chosen": -24.088329315185547,
"logps/rejected": -26.851608276367188,
"loss": 0.6888,
"losses/dpo": 0.6858267188072205,
"losses/sft": 0.6961312294006348,
"losses/total": 0.6858267188072205,
"ref_logps/chosen": -24.163042068481445,
"ref_logps/rejected": -26.837688446044922,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.007471038028597832,
"rewards/margins": 0.008863050490617752,
"rewards/rejected": -0.0013920125784352422,
"step": 61
},
{
"epoch": 0.47,
"learning_rate": 4.691011235955056e-07,
"logps/chosen": -23.13729476928711,
"logps/rejected": -28.607454299926758,
"loss": 0.6896,
"losses/dpo": 0.6952353715896606,
"losses/sft": 0.8425909280776978,
"losses/total": 0.6952353715896606,
"ref_logps/chosen": -23.206546783447266,
"ref_logps/rejected": -28.603229522705078,
"rewards/accuracies": 0.578125,
"rewards/chosen": 0.006925276480615139,
"rewards/margins": 0.007347787730395794,
"rewards/rejected": -0.00042251107515767217,
"step": 62
},
{
"epoch": 0.48,
"learning_rate": 4.6769662921348315e-07,
"logps/chosen": -22.758800506591797,
"logps/rejected": -25.503629684448242,
"loss": 0.6882,
"losses/dpo": 0.690306544303894,
"losses/sft": 0.7292711734771729,
"losses/total": 0.690306544303894,
"ref_logps/chosen": -22.867115020751953,
"ref_logps/rejected": -25.50885009765625,
"rewards/accuracies": 0.640625,
"rewards/chosen": 0.010831332765519619,
"rewards/margins": 0.010309312492609024,
"rewards/rejected": 0.0005220210296101868,
"step": 63
},
{
"epoch": 0.48,
"learning_rate": 4.662921348314606e-07,
"logps/chosen": -22.957290649414062,
"logps/rejected": -27.15595245361328,
"loss": 0.6868,
"losses/dpo": 0.6876275539398193,
"losses/sft": 0.9537997245788574,
"losses/total": 0.6876275539398193,
"ref_logps/chosen": -23.08481788635254,
"ref_logps/rejected": -27.15395736694336,
"rewards/accuracies": 0.6484375,
"rewards/chosen": 0.012752560898661613,
"rewards/margins": 0.012952261604368687,
"rewards/rejected": -0.0001996997743844986,
"step": 64
},
{
"epoch": 0.49,
"learning_rate": 4.6488764044943816e-07,
"logps/chosen": -21.856212615966797,
"logps/rejected": -28.90016746520996,
"loss": 0.688,
"losses/dpo": 0.6866365075111389,
"losses/sft": 0.748786211013794,
"losses/total": 0.6866365075111389,
"ref_logps/chosen": -21.946701049804688,
"ref_logps/rejected": -28.884090423583984,
"rewards/accuracies": 0.5546875,
"rewards/chosen": 0.009048780426383018,
"rewards/margins": 0.010656429454684258,
"rewards/rejected": -0.0016076482133939862,
"step": 65
},
{
"epoch": 0.5,
"learning_rate": 4.634831460674157e-07,
"logps/chosen": -21.727970123291016,
"logps/rejected": -24.484195709228516,
"loss": 0.6866,
"losses/dpo": 0.6858303546905518,
"losses/sft": 0.7428255677223206,
"losses/total": 0.6858303546905518,
"ref_logps/chosen": -21.827533721923828,
"ref_logps/rejected": -24.44991111755371,
"rewards/accuracies": 0.6484375,
"rewards/chosen": 0.009956244379281998,
"rewards/margins": 0.013384684920310974,
"rewards/rejected": -0.0034284412395209074,
"step": 66
},
{
"epoch": 0.51,
"learning_rate": 4.620786516853932e-07,
"logps/chosen": -23.145030975341797,
"logps/rejected": -25.03292465209961,
"loss": 0.685,
"losses/dpo": 0.6789939403533936,
"losses/sft": 0.718001127243042,
"losses/total": 0.6789939403533936,
"ref_logps/chosen": -23.27937889099121,
"ref_logps/rejected": -24.999483108520508,
"rewards/accuracies": 0.671875,
"rewards/chosen": 0.013434557244181633,
"rewards/margins": 0.016778716817498207,
"rewards/rejected": -0.0033441600389778614,
"step": 67
},
{
"epoch": 0.51,
"learning_rate": 4.606741573033708e-07,
"logps/chosen": -21.208370208740234,
"logps/rejected": -25.74646759033203,
"loss": 0.6852,
"losses/dpo": 0.6921157836914062,
"losses/sft": 0.8621765971183777,
"losses/total": 0.6921157836914062,
"ref_logps/chosen": -21.325489044189453,
"ref_logps/rejected": -25.700342178344727,
"rewards/accuracies": 0.640625,
"rewards/chosen": 0.011711984872817993,
"rewards/margins": 0.016324326395988464,
"rewards/rejected": -0.0046123419888317585,
"step": 68
},
{
"epoch": 0.52,
"learning_rate": 4.592696629213483e-07,
"logps/chosen": -22.621421813964844,
"logps/rejected": -28.81465721130371,
"loss": 0.6885,
"losses/dpo": 0.689292848110199,
"losses/sft": 0.7215853929519653,
"losses/total": 0.689292848110199,
"ref_logps/chosen": -22.720117568969727,
"ref_logps/rejected": -28.81524658203125,
"rewards/accuracies": 0.609375,
"rewards/chosen": 0.009869576431810856,
"rewards/margins": 0.009810445830225945,
"rewards/rejected": 5.913013592362404e-05,
"step": 69
},
{
"epoch": 0.53,
"learning_rate": 4.5786516853932584e-07,
"logps/chosen": -22.636703491210938,
"logps/rejected": -28.595046997070312,
"loss": 0.6872,
"losses/dpo": 0.6876038312911987,
"losses/sft": 0.7616434097290039,
"losses/total": 0.6876038312911987,
"ref_logps/chosen": -22.73769187927246,
"ref_logps/rejected": -28.57284164428711,
"rewards/accuracies": 0.6328125,
"rewards/chosen": 0.01009867899119854,
"rewards/margins": 0.01231930311769247,
"rewards/rejected": -0.002220625290647149,
"step": 70
},
{
"epoch": 0.54,
"learning_rate": 4.5646067415730334e-07,
"logps/chosen": -23.055517196655273,
"logps/rejected": -28.524490356445312,
"loss": 0.6849,
"losses/dpo": 0.6818934082984924,
"losses/sft": 0.8828948736190796,
"losses/total": 0.6818934082984924,
"ref_logps/chosen": -23.18179702758789,
"ref_logps/rejected": -28.479928970336914,
"rewards/accuracies": 0.6796875,
"rewards/chosen": 0.012628016993403435,
"rewards/margins": 0.01708414778113365,
"rewards/rejected": -0.004456131719052792,
"step": 71
},
{
"epoch": 0.54,
"learning_rate": 4.550561797752809e-07,
"logps/chosen": -25.802350997924805,
"logps/rejected": -29.403223037719727,
"loss": 0.682,
"losses/dpo": 0.6922101974487305,
"losses/sft": 0.7417640089988708,
"losses/total": 0.6922101974487305,
"ref_logps/chosen": -25.971485137939453,
"ref_logps/rejected": -29.342666625976562,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.016913428902626038,
"rewards/margins": 0.02296869084239006,
"rewards/rejected": -0.006055259145796299,
"step": 72
},
{
"epoch": 0.55,
"learning_rate": 4.536516853932584e-07,
"logps/chosen": -22.979541778564453,
"logps/rejected": -31.861392974853516,
"loss": 0.6849,
"losses/dpo": 0.6843876242637634,
"losses/sft": 0.6335030198097229,
"losses/total": 0.6843876242637634,
"ref_logps/chosen": -23.086105346679688,
"ref_logps/rejected": -31.796035766601562,
"rewards/accuracies": 0.609375,
"rewards/chosen": 0.010656386613845825,
"rewards/margins": 0.017192194238305092,
"rewards/rejected": -0.006535808090120554,
"step": 73
},
{
"epoch": 0.56,
"learning_rate": 4.522471910112359e-07,
"logps/chosen": -21.333240509033203,
"logps/rejected": -25.32451629638672,
"loss": 0.6842,
"losses/dpo": 0.6832489967346191,
"losses/sft": 0.8737274408340454,
"losses/total": 0.6832489967346191,
"ref_logps/chosen": -21.456268310546875,
"ref_logps/rejected": -25.263538360595703,
"rewards/accuracies": 0.703125,
"rewards/chosen": 0.012302841059863567,
"rewards/margins": 0.01840106211602688,
"rewards/rejected": -0.006098220124840736,
"step": 74
},
{
"epoch": 0.57,
"learning_rate": 4.5084269662921347e-07,
"logps/chosen": -21.905548095703125,
"logps/rejected": -25.504837036132812,
"loss": 0.6845,
"losses/dpo": 0.6803750991821289,
"losses/sft": 0.7227590084075928,
"losses/total": 0.6803750991821289,
"ref_logps/chosen": -22.001012802124023,
"ref_logps/rejected": -25.422731399536133,
"rewards/accuracies": 0.6953125,
"rewards/chosen": 0.009546317160129547,
"rewards/margins": 0.01775689423084259,
"rewards/rejected": -0.008210576139390469,
"step": 75
},
{
"epoch": 0.57,
"learning_rate": 4.4943820224719097e-07,
"logps/chosen": -22.212453842163086,
"logps/rejected": -25.56966209411621,
"loss": 0.6845,
"losses/dpo": 0.6879241466522217,
"losses/sft": 0.936349093914032,
"losses/total": 0.6879241466522217,
"ref_logps/chosen": -22.337223052978516,
"ref_logps/rejected": -25.51331901550293,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.012477071955800056,
"rewards/margins": 0.018111376091837883,
"rewards/rejected": -0.005634305067360401,
"step": 76
},
{
"epoch": 0.58,
"learning_rate": 4.4803370786516853e-07,
"logps/chosen": -20.199138641357422,
"logps/rejected": -26.30996322631836,
"loss": 0.6818,
"losses/dpo": 0.6872521638870239,
"losses/sft": 0.6872013211250305,
"losses/total": 0.6872521638870239,
"ref_logps/chosen": -20.368690490722656,
"ref_logps/rejected": -26.24540138244629,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.016955075785517693,
"rewards/margins": 0.02341129444539547,
"rewards/rejected": -0.006456219125539064,
"step": 77
},
{
"epoch": 0.59,
"learning_rate": 4.4662921348314603e-07,
"logps/chosen": -22.031774520874023,
"logps/rejected": -26.07961082458496,
"loss": 0.6813,
"losses/dpo": 0.6833238005638123,
"losses/sft": 0.7775546312332153,
"losses/total": 0.6833238005638123,
"ref_logps/chosen": -22.163204193115234,
"ref_logps/rejected": -25.965608596801758,
"rewards/accuracies": 0.7109375,
"rewards/chosen": 0.01314287818968296,
"rewards/margins": 0.02454320341348648,
"rewards/rejected": -0.011400324292480946,
"step": 78
},
{
"epoch": 0.6,
"learning_rate": 4.452247191011236e-07,
"logps/chosen": -22.522083282470703,
"logps/rejected": -26.621906280517578,
"loss": 0.6801,
"losses/dpo": 0.6835525035858154,
"losses/sft": 0.7558909058570862,
"losses/total": 0.6835525035858154,
"ref_logps/chosen": -22.656497955322266,
"ref_logps/rejected": -26.48514747619629,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.013441269285976887,
"rewards/margins": 0.027117114514112473,
"rewards/rejected": -0.013675847090780735,
"step": 79
},
{
"epoch": 0.6,
"learning_rate": 4.438202247191011e-07,
"logps/chosen": -22.05775260925293,
"logps/rejected": -26.428781509399414,
"loss": 0.6836,
"losses/dpo": 0.6767468452453613,
"losses/sft": 0.8101401329040527,
"losses/total": 0.6767468452453613,
"ref_logps/chosen": -22.211511611938477,
"ref_logps/rejected": -26.38385772705078,
"rewards/accuracies": 0.59375,
"rewards/chosen": 0.015375516377389431,
"rewards/margins": 0.019868001341819763,
"rewards/rejected": -0.004492484033107758,
"step": 80
},
{
"epoch": 0.61,
"learning_rate": 4.4241573033707865e-07,
"logps/chosen": -22.327136993408203,
"logps/rejected": -27.90719985961914,
"loss": 0.6803,
"losses/dpo": 0.6830211281776428,
"losses/sft": 0.7352213263511658,
"losses/total": 0.6830211281776428,
"ref_logps/chosen": -22.457595825195312,
"ref_logps/rejected": -27.77078628540039,
"rewards/accuracies": 0.6484375,
"rewards/chosen": 0.013045946136116982,
"rewards/margins": 0.026687312871217728,
"rewards/rejected": -0.013641366735100746,
"step": 81
},
{
"epoch": 0.62,
"learning_rate": 4.410112359550562e-07,
"logps/chosen": -23.738140106201172,
"logps/rejected": -26.4810791015625,
"loss": 0.6802,
"losses/dpo": 0.673937976360321,
"losses/sft": 0.7962872385978699,
"losses/total": 0.673937976360321,
"ref_logps/chosen": -23.89459991455078,
"ref_logps/rejected": -26.366804122924805,
"rewards/accuracies": 0.65625,
"rewards/chosen": 0.015645721927285194,
"rewards/margins": 0.027073292061686516,
"rewards/rejected": -0.011427570134401321,
"step": 82
},
{
"epoch": 0.63,
"learning_rate": 4.3960674157303366e-07,
"logps/chosen": -21.008014678955078,
"logps/rejected": -24.34069061279297,
"loss": 0.6805,
"losses/dpo": 0.6789628863334656,
"losses/sft": 0.9124815464019775,
"losses/total": 0.6789628863334656,
"ref_logps/chosen": -21.115734100341797,
"ref_logps/rejected": -24.184371948242188,
"rewards/accuracies": 0.65625,
"rewards/chosen": 0.01077171228826046,
"rewards/margins": 0.026403725147247314,
"rewards/rejected": -0.015632012858986855,
"step": 83
},
{
"epoch": 0.63,
"learning_rate": 4.382022471910112e-07,
"logps/chosen": -20.62143325805664,
"logps/rejected": -26.963245391845703,
"loss": 0.6833,
"losses/dpo": 0.6907744407653809,
"losses/sft": 0.7639827728271484,
"losses/total": 0.6907744407653809,
"ref_logps/chosen": -20.739776611328125,
"ref_logps/rejected": -26.87374496459961,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.011834252625703812,
"rewards/margins": 0.020784219726920128,
"rewards/rejected": -0.008949968963861465,
"step": 84
},
{
"epoch": 0.64,
"learning_rate": 4.367977528089887e-07,
"logps/chosen": -21.591964721679688,
"logps/rejected": -24.5494384765625,
"loss": 0.6846,
"losses/dpo": 0.6878204345703125,
"losses/sft": 0.6917088627815247,
"losses/total": 0.6878204345703125,
"ref_logps/chosen": -21.644916534423828,
"ref_logps/rejected": -24.421239852905273,
"rewards/accuracies": 0.6328125,
"rewards/chosen": 0.005295174196362495,
"rewards/margins": 0.01811503805220127,
"rewards/rejected": -0.012819863855838776,
"step": 85
},
{
"epoch": 0.65,
"learning_rate": 4.353932584269663e-07,
"logps/chosen": -24.759811401367188,
"logps/rejected": -28.227123260498047,
"loss": 0.6825,
"losses/dpo": 0.6937445402145386,
"losses/sft": 0.9424384832382202,
"losses/total": 0.6937445402145386,
"ref_logps/chosen": -24.891460418701172,
"ref_logps/rejected": -28.136310577392578,
"rewards/accuracies": 0.6484375,
"rewards/chosen": 0.013164759613573551,
"rewards/margins": 0.022245781496167183,
"rewards/rejected": -0.009081022813916206,
"step": 86
},
{
"epoch": 0.66,
"learning_rate": 4.339887640449438e-07,
"logps/chosen": -22.8006591796875,
"logps/rejected": -26.10009002685547,
"loss": 0.6795,
"losses/dpo": 0.6909404993057251,
"losses/sft": 0.8603497743606567,
"losses/total": 0.6909404993057251,
"ref_logps/chosen": -22.96673583984375,
"ref_logps/rejected": -25.977882385253906,
"rewards/accuracies": 0.7109375,
"rewards/chosen": 0.01660749316215515,
"rewards/margins": 0.028828214854002,
"rewards/rejected": -0.012220719829201698,
"step": 87
},
{
"epoch": 0.66,
"learning_rate": 4.3258426966292134e-07,
"logps/chosen": -24.15732765197754,
"logps/rejected": -28.13039779663086,
"loss": 0.6752,
"losses/dpo": 0.6638558506965637,
"losses/sft": 0.8455443382263184,
"losses/total": 0.6638558506965637,
"ref_logps/chosen": -24.341087341308594,
"ref_logps/rejected": -27.938106536865234,
"rewards/accuracies": 0.65625,
"rewards/chosen": 0.018375899642705917,
"rewards/margins": 0.03760489821434021,
"rewards/rejected": -0.019228998571634293,
"step": 88
},
{
"epoch": 0.67,
"learning_rate": 4.311797752808989e-07,
"logps/chosen": -21.290430068969727,
"logps/rejected": -25.207626342773438,
"loss": 0.6771,
"losses/dpo": 0.6774411797523499,
"losses/sft": 0.9257520437240601,
"losses/total": 0.6774411797523499,
"ref_logps/chosen": -21.449438095092773,
"ref_logps/rejected": -25.032873153686523,
"rewards/accuracies": 0.65625,
"rewards/chosen": 0.015900880098342896,
"rewards/margins": 0.033375710248947144,
"rewards/rejected": -0.017474830150604248,
"step": 89
},
{
"epoch": 0.68,
"learning_rate": 4.297752808988764e-07,
"logps/chosen": -24.241390228271484,
"logps/rejected": -27.57483673095703,
"loss": 0.681,
"losses/dpo": 0.6869298219680786,
"losses/sft": 0.8004887104034424,
"losses/total": 0.6869298219680786,
"ref_logps/chosen": -24.399887084960938,
"ref_logps/rejected": -27.475460052490234,
"rewards/accuracies": 0.640625,
"rewards/chosen": 0.015849877148866653,
"rewards/margins": 0.02578754723072052,
"rewards/rejected": -0.009937671013176441,
"step": 90
},
{
"epoch": 0.69,
"learning_rate": 4.2837078651685396e-07,
"logps/chosen": -21.290605545043945,
"logps/rejected": -25.188884735107422,
"loss": 0.6843,
"losses/dpo": 0.6896719336509705,
"losses/sft": 0.7865870594978333,
"losses/total": 0.6896719336509705,
"ref_logps/chosen": -21.39483642578125,
"ref_logps/rejected": -25.098407745361328,
"rewards/accuracies": 0.6328125,
"rewards/chosen": 0.010422902181744576,
"rewards/margins": 0.01947084441781044,
"rewards/rejected": -0.00904794316738844,
"step": 91
},
{
"epoch": 0.69,
"learning_rate": 4.269662921348314e-07,
"logps/chosen": -21.395389556884766,
"logps/rejected": -24.005056381225586,
"loss": 0.6863,
"losses/dpo": 0.6820717453956604,
"losses/sft": 0.8161361813545227,
"losses/total": 0.6820717453956604,
"ref_logps/chosen": -21.495037078857422,
"ref_logps/rejected": -23.9505558013916,
"rewards/accuracies": 0.53125,
"rewards/chosen": 0.009964808821678162,
"rewards/margins": 0.015414956025779247,
"rewards/rejected": -0.00545014813542366,
"step": 92
},
{
"epoch": 0.7,
"learning_rate": 4.2556179775280896e-07,
"logps/chosen": -20.948806762695312,
"logps/rejected": -24.735366821289062,
"loss": 0.6786,
"losses/dpo": 0.6868577599525452,
"losses/sft": 0.7177249193191528,
"losses/total": 0.6868577599525452,
"ref_logps/chosen": -21.072193145751953,
"ref_logps/rejected": -24.555286407470703,
"rewards/accuracies": 0.6953125,
"rewards/chosen": 0.012338603846728802,
"rewards/margins": 0.030346699059009552,
"rewards/rejected": -0.018008096143603325,
"step": 93
},
{
"epoch": 0.71,
"learning_rate": 4.2415730337078647e-07,
"logps/chosen": -24.245830535888672,
"logps/rejected": -28.811023712158203,
"loss": 0.6783,
"losses/dpo": 0.6721839904785156,
"losses/sft": 0.816402018070221,
"losses/total": 0.6721839904785156,
"ref_logps/chosen": -24.40906524658203,
"ref_logps/rejected": -28.65966033935547,
"rewards/accuracies": 0.703125,
"rewards/chosen": 0.01632346771657467,
"rewards/margins": 0.03146028146147728,
"rewards/rejected": -0.01513681747019291,
"step": 94
},
{
"epoch": 0.72,
"learning_rate": 4.22752808988764e-07,
"logps/chosen": -22.48372459411621,
"logps/rejected": -29.088359832763672,
"loss": 0.6709,
"losses/dpo": 0.6718644499778748,
"losses/sft": 0.823063313961029,
"losses/total": 0.6718644499778748,
"ref_logps/chosen": -22.634136199951172,
"ref_logps/rejected": -28.77379608154297,
"rewards/accuracies": 0.7109375,
"rewards/chosen": 0.01504128985106945,
"rewards/margins": 0.04649777710437775,
"rewards/rejected": -0.03145648539066315,
"step": 95
},
{
"epoch": 0.72,
"learning_rate": 4.2134831460674153e-07,
"logps/chosen": -20.869436264038086,
"logps/rejected": -27.790451049804688,
"loss": 0.6785,
"losses/dpo": 0.6842025518417358,
"losses/sft": 0.8330531120300293,
"losses/total": 0.6842025518417358,
"ref_logps/chosen": -20.964067459106445,
"ref_logps/rejected": -27.572711944580078,
"rewards/accuracies": 0.6484375,
"rewards/chosen": 0.009463240392506123,
"rewards/margins": 0.031237438321113586,
"rewards/rejected": -0.02177419885993004,
"step": 96
},
{
"epoch": 0.73,
"learning_rate": 4.199438202247191e-07,
"logps/chosen": -22.02164077758789,
"logps/rejected": -28.644880294799805,
"loss": 0.6772,
"losses/dpo": 0.6933009028434753,
"losses/sft": 0.7342395186424255,
"losses/total": 0.6933009028434753,
"ref_logps/chosen": -22.146129608154297,
"ref_logps/rejected": -28.42925262451172,
"rewards/accuracies": 0.640625,
"rewards/chosen": 0.012448801659047604,
"rewards/margins": 0.03401148319244385,
"rewards/rejected": -0.02156267873942852,
"step": 97
},
{
"epoch": 0.74,
"learning_rate": 4.1853932584269664e-07,
"logps/chosen": -21.086360931396484,
"logps/rejected": -23.74181365966797,
"loss": 0.6834,
"losses/dpo": 0.7061095833778381,
"losses/sft": 0.6976662278175354,
"losses/total": 0.7061095833778381,
"ref_logps/chosen": -21.240116119384766,
"ref_logps/rejected": -23.683523178100586,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.015375564806163311,
"rewards/margins": 0.021204624325037003,
"rewards/rejected": -0.0058290609158575535,
"step": 98
},
{
"epoch": 0.75,
"learning_rate": 4.1713483146067415e-07,
"logps/chosen": -21.535640716552734,
"logps/rejected": -28.555763244628906,
"loss": 0.6749,
"losses/dpo": 0.6546899080276489,
"losses/sft": 0.8132616281509399,
"losses/total": 0.6546899080276489,
"ref_logps/chosen": -21.68370819091797,
"ref_logps/rejected": -28.313589096069336,
"rewards/accuracies": 0.6640625,
"rewards/chosen": 0.014806646853685379,
"rewards/margins": 0.03902393952012062,
"rewards/rejected": -0.02421729266643524,
"step": 99
},
{
"epoch": 0.75,
"learning_rate": 4.157303370786517e-07,
"logps/chosen": -22.314010620117188,
"logps/rejected": -26.403512954711914,
"loss": 0.6777,
"losses/dpo": 0.6830233931541443,
"losses/sft": 0.7298552393913269,
"losses/total": 0.6830233931541443,
"ref_logps/chosen": -22.442527770996094,
"ref_logps/rejected": -26.1983699798584,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.012851729989051819,
"rewards/margins": 0.033365827053785324,
"rewards/rejected": -0.020514097064733505,
"step": 100
},
{
"epoch": 0.76,
"learning_rate": 4.1432584269662915e-07,
"logps/chosen": -23.65606117248535,
"logps/rejected": -27.6639461517334,
"loss": 0.6787,
"losses/dpo": 0.66861492395401,
"losses/sft": 0.7538549900054932,
"losses/total": 0.66861492395401,
"ref_logps/chosen": -23.742881774902344,
"ref_logps/rejected": -27.43739128112793,
"rewards/accuracies": 0.6328125,
"rewards/chosen": 0.008682135492563248,
"rewards/margins": 0.03133738413453102,
"rewards/rejected": -0.022655250504612923,
"step": 101
},
{
"epoch": 0.77,
"learning_rate": 4.129213483146067e-07,
"logps/chosen": -21.20174789428711,
"logps/rejected": -27.045516967773438,
"loss": 0.6736,
"losses/dpo": 0.6594799757003784,
"losses/sft": 0.7625675201416016,
"losses/total": 0.6594799757003784,
"ref_logps/chosen": -21.360929489135742,
"ref_logps/rejected": -26.788671493530273,
"rewards/accuracies": 0.6640625,
"rewards/chosen": 0.015918483957648277,
"rewards/margins": 0.04160304740071297,
"rewards/rejected": -0.02568456158041954,
"step": 102
},
{
"epoch": 0.78,
"learning_rate": 4.115168539325842e-07,
"logps/chosen": -25.287567138671875,
"logps/rejected": -27.158187866210938,
"loss": 0.6789,
"losses/dpo": 0.6871756315231323,
"losses/sft": 0.7897288799285889,
"losses/total": 0.6871756315231323,
"ref_logps/chosen": -25.39737319946289,
"ref_logps/rejected": -26.95665740966797,
"rewards/accuracies": 0.6015625,
"rewards/chosen": 0.010980643332004547,
"rewards/margins": 0.03113364614546299,
"rewards/rejected": -0.020153000950813293,
"step": 103
},
{
"epoch": 0.78,
"learning_rate": 4.1011235955056177e-07,
"logps/chosen": -20.239051818847656,
"logps/rejected": -27.055557250976562,
"loss": 0.6766,
"losses/dpo": 0.6560062170028687,
"losses/sft": 0.7211654186248779,
"losses/total": 0.6560062170028687,
"ref_logps/chosen": -20.345287322998047,
"ref_logps/rejected": -26.804546356201172,
"rewards/accuracies": 0.640625,
"rewards/chosen": 0.010623706504702568,
"rewards/margins": 0.03572461009025574,
"rewards/rejected": -0.02510090172290802,
"step": 104
},
{
"epoch": 0.79,
"learning_rate": 4.0870786516853933e-07,
"logps/chosen": -22.816429138183594,
"logps/rejected": -28.331439971923828,
"loss": 0.6728,
"losses/dpo": 0.6975245475769043,
"losses/sft": 0.8287545442581177,
"losses/total": 0.6975245475769043,
"ref_logps/chosen": -22.96261215209961,
"ref_logps/rejected": -28.04006576538086,
"rewards/accuracies": 0.7109375,
"rewards/chosen": 0.014618270099163055,
"rewards/margins": 0.043755702674388885,
"rewards/rejected": -0.02913743630051613,
"step": 105
},
{
"epoch": 0.8,
"learning_rate": 4.0730337078651683e-07,
"logps/chosen": -22.864845275878906,
"logps/rejected": -27.868162155151367,
"loss": 0.6776,
"losses/dpo": 0.6524635553359985,
"losses/sft": 0.8967273235321045,
"losses/total": 0.6524635553359985,
"ref_logps/chosen": -22.934465408325195,
"ref_logps/rejected": -27.60092544555664,
"rewards/accuracies": 0.65625,
"rewards/chosen": 0.006961943581700325,
"rewards/margins": 0.03368568420410156,
"rewards/rejected": -0.026723740622401237,
"step": 106
},
{
"epoch": 0.81,
"learning_rate": 4.058988764044944e-07,
"logps/chosen": -26.633420944213867,
"logps/rejected": -29.40836524963379,
"loss": 0.6785,
"losses/dpo": 0.6883168816566467,
"losses/sft": 0.9007142782211304,
"losses/total": 0.6883168816566467,
"ref_logps/chosen": -26.658733367919922,
"ref_logps/rejected": -29.11638641357422,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.0025312139187008142,
"rewards/margins": 0.031729087233543396,
"rewards/rejected": -0.029197873547673225,
"step": 107
},
{
"epoch": 0.82,
"learning_rate": 4.044943820224719e-07,
"logps/chosen": -21.93716049194336,
"logps/rejected": -26.78734016418457,
"loss": 0.6678,
"losses/dpo": 0.6620572805404663,
"losses/sft": 0.7277075052261353,
"losses/total": 0.6620572805404663,
"ref_logps/chosen": -22.14274787902832,
"ref_logps/rejected": -26.450454711914062,
"rewards/accuracies": 0.7265625,
"rewards/chosen": 0.02055862732231617,
"rewards/margins": 0.05424723029136658,
"rewards/rejected": -0.03368859738111496,
"step": 108
},
{
"epoch": 0.82,
"learning_rate": 4.0308988764044945e-07,
"logps/chosen": -23.479236602783203,
"logps/rejected": -25.321468353271484,
"loss": 0.6732,
"losses/dpo": 0.6536136865615845,
"losses/sft": 0.793202817440033,
"losses/total": 0.6536136865615845,
"ref_logps/chosen": -23.628402709960938,
"ref_logps/rejected": -25.03476905822754,
"rewards/accuracies": 0.6171875,
"rewards/chosen": 0.014916517771780491,
"rewards/margins": 0.043586596846580505,
"rewards/rejected": -0.02867007628083229,
"step": 109
},
{
"epoch": 0.83,
"learning_rate": 4.0168539325842696e-07,
"logps/chosen": -21.36187744140625,
"logps/rejected": -26.808046340942383,
"loss": 0.6677,
"losses/dpo": 0.658541202545166,
"losses/sft": 0.6240718364715576,
"losses/total": 0.658541202545166,
"ref_logps/chosen": -21.525625228881836,
"ref_logps/rejected": -26.417198181152344,
"rewards/accuracies": 0.640625,
"rewards/chosen": 0.01637459173798561,
"rewards/margins": 0.05545924976468086,
"rewards/rejected": -0.03908466175198555,
"step": 110
},
{
"epoch": 0.84,
"learning_rate": 4.0028089887640446e-07,
"logps/chosen": -22.143728256225586,
"logps/rejected": -26.035858154296875,
"loss": 0.6732,
"losses/dpo": 0.6707695126533508,
"losses/sft": 0.8353971838951111,
"losses/total": 0.6707695126533508,
"ref_logps/chosen": -22.292274475097656,
"ref_logps/rejected": -25.744632720947266,
"rewards/accuracies": 0.6484375,
"rewards/chosen": 0.014854478649795055,
"rewards/margins": 0.043976958841085434,
"rewards/rejected": -0.029122481122612953,
"step": 111
},
{
"epoch": 0.85,
"learning_rate": 3.9887640449438196e-07,
"logps/chosen": -22.15041732788086,
"logps/rejected": -24.53826332092285,
"loss": 0.6688,
"losses/dpo": 0.6656994819641113,
"losses/sft": 0.8727293014526367,
"losses/total": 0.6656994819641113,
"ref_logps/chosen": -22.210494995117188,
"ref_logps/rejected": -24.07231330871582,
"rewards/accuracies": 0.7109375,
"rewards/chosen": 0.0060077933594584465,
"rewards/margins": 0.05260289087891579,
"rewards/rejected": -0.046595096588134766,
"step": 112
},
{
"epoch": 0.85,
"learning_rate": 3.974719101123595e-07,
"logps/chosen": -23.314592361450195,
"logps/rejected": -27.797752380371094,
"loss": 0.675,
"losses/dpo": 0.6690158247947693,
"losses/sft": 0.7370929718017578,
"losses/total": 0.6690158247947693,
"ref_logps/chosen": -23.396080017089844,
"ref_logps/rejected": -27.483016967773438,
"rewards/accuracies": 0.609375,
"rewards/chosen": 0.008148876950144768,
"rewards/margins": 0.03962232545018196,
"rewards/rejected": -0.031473446637392044,
"step": 113
},
{
"epoch": 0.86,
"learning_rate": 3.960674157303371e-07,
"logps/chosen": -21.854373931884766,
"logps/rejected": -26.652328491210938,
"loss": 0.6706,
"losses/dpo": 0.645140528678894,
"losses/sft": 0.77164226770401,
"losses/total": 0.645140528678894,
"ref_logps/chosen": -21.949893951416016,
"ref_logps/rejected": -26.255746841430664,
"rewards/accuracies": 0.6484375,
"rewards/chosen": 0.009551877155900002,
"rewards/margins": 0.0492100827395916,
"rewards/rejected": -0.03965820372104645,
"step": 114
},
{
"epoch": 0.87,
"learning_rate": 3.946629213483146e-07,
"logps/chosen": -23.778413772583008,
"logps/rejected": -28.40381622314453,
"loss": 0.6634,
"losses/dpo": 0.6699668169021606,
"losses/sft": 0.8002771139144897,
"losses/total": 0.6699668169021606,
"ref_logps/chosen": -23.903501510620117,
"ref_logps/rejected": -27.89557647705078,
"rewards/accuracies": 0.71875,
"rewards/chosen": 0.0125090591609478,
"rewards/margins": 0.06333282589912415,
"rewards/rejected": -0.050823770463466644,
"step": 115
},
{
"epoch": 0.88,
"learning_rate": 3.9325842696629214e-07,
"logps/chosen": -24.59353256225586,
"logps/rejected": -28.184139251708984,
"loss": 0.6658,
"losses/dpo": 0.6745936870574951,
"losses/sft": 0.8017398715019226,
"losses/total": 0.6745936870574951,
"ref_logps/chosen": -24.77825164794922,
"ref_logps/rejected": -27.77378273010254,
"rewards/accuracies": 0.671875,
"rewards/chosen": 0.018472209572792053,
"rewards/margins": 0.059507861733436584,
"rewards/rejected": -0.04103565216064453,
"step": 116
},
{
"epoch": 0.88,
"learning_rate": 3.9185393258426964e-07,
"logps/chosen": -20.781490325927734,
"logps/rejected": -25.704240798950195,
"loss": 0.6641,
"losses/dpo": 0.6748782396316528,
"losses/sft": 0.6509857177734375,
"losses/total": 0.6748782396316528,
"ref_logps/chosen": -20.93104362487793,
"ref_logps/rejected": -25.223262786865234,
"rewards/accuracies": 0.703125,
"rewards/chosen": 0.014955190010368824,
"rewards/margins": 0.06305292248725891,
"rewards/rejected": -0.04809773340821266,
"step": 117
},
{
"epoch": 0.89,
"learning_rate": 3.904494382022472e-07,
"logps/chosen": -22.889171600341797,
"logps/rejected": -28.954145431518555,
"loss": 0.6719,
"losses/dpo": 0.6790695190429688,
"losses/sft": 0.7899962663650513,
"losses/total": 0.6790695190429688,
"ref_logps/chosen": -22.998294830322266,
"ref_logps/rejected": -28.596576690673828,
"rewards/accuracies": 0.640625,
"rewards/chosen": 0.010911967605352402,
"rewards/margins": 0.04666893184185028,
"rewards/rejected": -0.03575696796178818,
"step": 118
},
{
"epoch": 0.9,
"learning_rate": 3.890449438202247e-07,
"logps/chosen": -22.229143142700195,
"logps/rejected": -24.892658233642578,
"loss": 0.6713,
"losses/dpo": 0.6665077209472656,
"losses/sft": 0.8753491044044495,
"losses/total": 0.6665077209472656,
"ref_logps/chosen": -22.402416229248047,
"ref_logps/rejected": -24.568809509277344,
"rewards/accuracies": 0.6484375,
"rewards/chosen": 0.01732712611556053,
"rewards/margins": 0.0497119314968586,
"rewards/rejected": -0.032384805381298065,
"step": 119
},
{
"epoch": 0.91,
"learning_rate": 3.876404494382022e-07,
"logps/chosen": -22.233783721923828,
"logps/rejected": -29.53872299194336,
"loss": 0.6637,
"losses/dpo": 0.6545946002006531,
"losses/sft": 0.8056938052177429,
"losses/total": 0.6545946002006531,
"ref_logps/chosen": -22.328821182250977,
"ref_logps/rejected": -28.996824264526367,
"rewards/accuracies": 0.7265625,
"rewards/chosen": 0.009503833949565887,
"rewards/margins": 0.06369376927614212,
"rewards/rejected": -0.05418993532657623,
"step": 120
},
{
"epoch": 0.91,
"learning_rate": 3.8623595505617977e-07,
"logps/chosen": -24.073867797851562,
"logps/rejected": -27.632476806640625,
"loss": 0.6778,
"losses/dpo": 0.6500009298324585,
"losses/sft": 0.9210071563720703,
"losses/total": 0.6500009298324585,
"ref_logps/chosen": -24.12955093383789,
"ref_logps/rejected": -27.32662582397461,
"rewards/accuracies": 0.5859375,
"rewards/chosen": 0.00556858628988266,
"rewards/margins": 0.03615354374051094,
"rewards/rejected": -0.03058495745062828,
"step": 121
},
{
"epoch": 0.92,
"learning_rate": 3.8483146067415727e-07,
"logps/chosen": -21.38442039489746,
"logps/rejected": -31.358665466308594,
"loss": 0.6601,
"losses/dpo": 0.6630659103393555,
"losses/sft": 0.8758641481399536,
"losses/total": 0.6630659103393555,
"ref_logps/chosen": -21.540292739868164,
"ref_logps/rejected": -30.79846954345703,
"rewards/accuracies": 0.703125,
"rewards/chosen": 0.01558714546263218,
"rewards/margins": 0.07160677015781403,
"rewards/rejected": -0.056019626557826996,
"step": 122
},
{
"epoch": 0.93,
"learning_rate": 3.834269662921348e-07,
"logps/chosen": -21.09262466430664,
"logps/rejected": -25.64166831970215,
"loss": 0.6622,
"losses/dpo": 0.6400080919265747,
"losses/sft": 0.8849148750305176,
"losses/total": 0.6400080919265747,
"ref_logps/chosen": -21.179445266723633,
"ref_logps/rejected": -25.056869506835938,
"rewards/accuracies": 0.6796875,
"rewards/chosen": 0.00868179090321064,
"rewards/margins": 0.06716156005859375,
"rewards/rejected": -0.05847976729273796,
"step": 123
},
{
"epoch": 0.94,
"learning_rate": 3.8202247191011233e-07,
"logps/chosen": -25.65859603881836,
"logps/rejected": -28.025104522705078,
"loss": 0.6765,
"losses/dpo": 0.6927012205123901,
"losses/sft": 0.8673559427261353,
"losses/total": 0.6927012205123901,
"ref_logps/chosen": -25.61608123779297,
"ref_logps/rejected": -27.600624084472656,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.004251426085829735,
"rewards/margins": 0.038196537643671036,
"rewards/rejected": -0.04244796186685562,
"step": 124
},
{
"epoch": 0.94,
"learning_rate": 3.806179775280899e-07,
"logps/chosen": -23.93341636657715,
"logps/rejected": -29.840375900268555,
"loss": 0.6647,
"losses/dpo": 0.7150436639785767,
"losses/sft": 0.9468034505844116,
"losses/total": 0.7150436639785767,
"ref_logps/chosen": -23.979652404785156,
"ref_logps/rejected": -29.25320816040039,
"rewards/accuracies": 0.703125,
"rewards/chosen": 0.004623853601515293,
"rewards/margins": 0.06334076821804047,
"rewards/rejected": -0.058716922998428345,
"step": 125
},
{
"epoch": 0.95,
"learning_rate": 3.792134831460674e-07,
"logps/chosen": -25.031259536743164,
"logps/rejected": -28.292198181152344,
"loss": 0.6559,
"losses/dpo": 0.6770719289779663,
"losses/sft": 0.9255229234695435,
"losses/total": 0.6770719289779663,
"ref_logps/chosen": -25.22754669189453,
"ref_logps/rejected": -27.667905807495117,
"rewards/accuracies": 0.7265625,
"rewards/chosen": 0.019628863781690598,
"rewards/margins": 0.08205802738666534,
"rewards/rejected": -0.06242916360497475,
"step": 126
},
{
"epoch": 0.96,
"learning_rate": 3.7780898876404495e-07,
"logps/chosen": -21.68558692932129,
"logps/rejected": -26.84676742553711,
"loss": 0.6765,
"losses/dpo": 0.635480523109436,
"losses/sft": 0.7413178086280823,
"losses/total": 0.635480523109436,
"ref_logps/chosen": -21.638694763183594,
"ref_logps/rejected": -26.392860412597656,
"rewards/accuracies": 0.5390625,
"rewards/chosen": -0.004689330700784922,
"rewards/margins": 0.04070135951042175,
"rewards/rejected": -0.04539068788290024,
"step": 127
},
{
"epoch": 0.97,
"learning_rate": 3.7640449438202245e-07,
"logps/chosen": -22.910152435302734,
"logps/rejected": -26.53976821899414,
"loss": 0.6587,
"losses/dpo": 0.6835530400276184,
"losses/sft": 0.9732310771942139,
"losses/total": 0.6835530400276184,
"ref_logps/chosen": -23.018016815185547,
"ref_logps/rejected": -25.88375473022461,
"rewards/accuracies": 0.703125,
"rewards/chosen": 0.010786494240164757,
"rewards/margins": 0.07638738304376602,
"rewards/rejected": -0.06560088694095612,
"step": 128
},
{
"epoch": 0.97,
"learning_rate": 3.75e-07,
"logps/chosen": -23.20888900756836,
"logps/rejected": -26.875211715698242,
"loss": 0.6617,
"losses/dpo": 0.6463422775268555,
"losses/sft": 0.7454620599746704,
"losses/total": 0.6463422775268555,
"ref_logps/chosen": -23.336442947387695,
"ref_logps/rejected": -26.300058364868164,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.012755412608385086,
"rewards/margins": 0.0702708438038826,
"rewards/rejected": -0.057515427470207214,
"step": 129
},
{
"epoch": 0.98,
"learning_rate": 3.735955056179775e-07,
"logps/chosen": -22.396747589111328,
"logps/rejected": -29.472164154052734,
"loss": 0.6784,
"losses/dpo": 0.6625787019729614,
"losses/sft": 0.7854889631271362,
"losses/total": 0.6625787019729614,
"ref_logps/chosen": -22.277332305908203,
"ref_logps/rejected": -28.998043060302734,
"rewards/accuracies": 0.6328125,
"rewards/chosen": -0.01194157637655735,
"rewards/margins": 0.03547064587473869,
"rewards/rejected": -0.047412216663360596,
"step": 130
},
{
"epoch": 0.99,
"learning_rate": 3.72191011235955e-07,
"logps/chosen": -18.81739044189453,
"logps/rejected": -24.600296020507812,
"loss": 0.6612,
"losses/dpo": 0.6598723530769348,
"losses/sft": 0.8644169569015503,
"losses/total": 0.6598723530769348,
"ref_logps/chosen": -18.89391326904297,
"ref_logps/rejected": -23.96309471130371,
"rewards/accuracies": 0.671875,
"rewards/chosen": 0.0076522137969732285,
"rewards/margins": 0.07137227803468704,
"rewards/rejected": -0.06372006982564926,
"step": 131
},
{
"epoch": 1.0,
"learning_rate": 3.707865168539326e-07,
"logps/chosen": -25.24700927734375,
"logps/rejected": -29.2607364654541,
"loss": 0.6576,
"losses/dpo": 0.6264052391052246,
"losses/sft": 0.7484258413314819,
"losses/total": 0.6264052391052246,
"ref_logps/chosen": -25.243091583251953,
"ref_logps/rejected": -28.458097457885742,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.00039180926978588104,
"rewards/margins": 0.07987209409475327,
"rewards/rejected": -0.0802639052271843,
"step": 132
},
{
"epoch": 1.0,
"learning_rate": 3.693820224719101e-07,
"logps/chosen": -24.664264678955078,
"logps/rejected": -29.071331024169922,
"loss": 0.6596,
"losses/dpo": 0.6850643157958984,
"losses/sft": 0.7063156366348267,
"losses/total": 0.6850643157958984,
"ref_logps/chosen": -24.58011245727539,
"ref_logps/rejected": -28.208541870117188,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.008415229618549347,
"rewards/margins": 0.07786377519369125,
"rewards/rejected": -0.08627899736166,
"step": 133
},
{
"epoch": 1.01,
"learning_rate": 3.6797752808988764e-07,
"logps/chosen": -21.803192138671875,
"logps/rejected": -25.79207992553711,
"loss": 0.6529,
"losses/dpo": 0.6567816734313965,
"losses/sft": 0.8528650403022766,
"losses/total": 0.6567816734313965,
"ref_logps/chosen": -21.88966941833496,
"ref_logps/rejected": -24.97705841064453,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.00864771381020546,
"rewards/margins": 0.0901501327753067,
"rewards/rejected": -0.08150242269039154,
"step": 134
},
{
"epoch": 1.02,
"learning_rate": 3.6657303370786514e-07,
"logps/chosen": -20.78626823425293,
"logps/rejected": -27.048810958862305,
"loss": 0.6442,
"losses/dpo": 0.6402660608291626,
"losses/sft": 0.7653439044952393,
"losses/total": 0.6402660608291626,
"ref_logps/chosen": -20.915481567382812,
"ref_logps/rejected": -26.105587005615234,
"rewards/accuracies": 0.765625,
"rewards/chosen": 0.012921325862407684,
"rewards/margins": 0.10724389553070068,
"rewards/rejected": -0.0943225771188736,
"step": 135
},
{
"epoch": 1.03,
"learning_rate": 3.651685393258427e-07,
"logps/chosen": -23.661598205566406,
"logps/rejected": -26.884532928466797,
"loss": 0.6563,
"losses/dpo": 0.6588989496231079,
"losses/sft": 0.8334387540817261,
"losses/total": 0.6588989496231079,
"ref_logps/chosen": -23.68170166015625,
"ref_logps/rejected": -26.042449951171875,
"rewards/accuracies": 0.671875,
"rewards/chosen": 0.0020102611742913723,
"rewards/margins": 0.08621874451637268,
"rewards/rejected": -0.08420848101377487,
"step": 136
},
{
"epoch": 1.03,
"learning_rate": 3.637640449438202e-07,
"logps/chosen": -21.846914291381836,
"logps/rejected": -26.843595504760742,
"loss": 0.6414,
"losses/dpo": 0.610801100730896,
"losses/sft": 0.6104759573936462,
"losses/total": 0.610801100730896,
"ref_logps/chosen": -21.904037475585938,
"ref_logps/rejected": -25.758628845214844,
"rewards/accuracies": 0.7734375,
"rewards/chosen": 0.005712391808629036,
"rewards/margins": 0.11420895159244537,
"rewards/rejected": -0.10849656164646149,
"step": 137
},
{
"epoch": 1.04,
"learning_rate": 3.6235955056179776e-07,
"logps/chosen": -23.79953384399414,
"logps/rejected": -26.24932861328125,
"loss": 0.6507,
"losses/dpo": 0.6711180806159973,
"losses/sft": 0.8334028720855713,
"losses/total": 0.6711180806159973,
"ref_logps/chosen": -23.89289093017578,
"ref_logps/rejected": -25.393817901611328,
"rewards/accuracies": 0.6953125,
"rewards/chosen": 0.009335671551525593,
"rewards/margins": 0.09488671272993088,
"rewards/rejected": -0.08555103838443756,
"step": 138
},
{
"epoch": 1.05,
"learning_rate": 3.6095505617977526e-07,
"logps/chosen": -20.413612365722656,
"logps/rejected": -28.091732025146484,
"loss": 0.6393,
"losses/dpo": 0.6086191534996033,
"losses/sft": 0.7045127749443054,
"losses/total": 0.6086191534996033,
"ref_logps/chosen": -20.591529846191406,
"ref_logps/rejected": -27.054677963256836,
"rewards/accuracies": 0.703125,
"rewards/chosen": 0.0177919864654541,
"rewards/margins": 0.12149728834629059,
"rewards/rejected": -0.10370529443025589,
"step": 139
},
{
"epoch": 1.06,
"learning_rate": 3.5955056179775277e-07,
"logps/chosen": -23.96946907043457,
"logps/rejected": -25.42624282836914,
"loss": 0.6574,
"losses/dpo": 0.6771029233932495,
"losses/sft": 0.8275946378707886,
"losses/total": 0.6771029233932495,
"ref_logps/chosen": -23.84187126159668,
"ref_logps/rejected": -24.474294662475586,
"rewards/accuracies": 0.703125,
"rewards/chosen": -0.012759597972035408,
"rewards/margins": 0.0824354737997055,
"rewards/rejected": -0.09519506990909576,
"step": 140
},
{
"epoch": 1.06,
"learning_rate": 3.581460674157303e-07,
"logps/chosen": -20.24493980407715,
"logps/rejected": -26.33192253112793,
"loss": 0.6403,
"losses/dpo": 0.60587477684021,
"losses/sft": 0.7718257904052734,
"losses/total": 0.60587477684021,
"ref_logps/chosen": -20.375638961791992,
"ref_logps/rejected": -25.299020767211914,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.013069930486381054,
"rewards/margins": 0.11636004596948624,
"rewards/rejected": -0.10329011082649231,
"step": 141
},
{
"epoch": 1.07,
"learning_rate": 3.5674157303370783e-07,
"logps/chosen": -22.9414119720459,
"logps/rejected": -28.200380325317383,
"loss": 0.6384,
"losses/dpo": 0.6827423572540283,
"losses/sft": 0.8567611575126648,
"losses/total": 0.6827423572540283,
"ref_logps/chosen": -23.111347198486328,
"ref_logps/rejected": -27.142616271972656,
"rewards/accuracies": 0.7421875,
"rewards/chosen": 0.01699351891875267,
"rewards/margins": 0.12276984751224518,
"rewards/rejected": -0.10577632486820221,
"step": 142
},
{
"epoch": 1.08,
"learning_rate": 3.553370786516854e-07,
"logps/chosen": -23.226070404052734,
"logps/rejected": -27.77198028564453,
"loss": 0.6624,
"losses/dpo": 0.6864386796951294,
"losses/sft": 0.8041479587554932,
"losses/total": 0.6864386796951294,
"ref_logps/chosen": -22.92740249633789,
"ref_logps/rejected": -26.72946548461914,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.029866419732570648,
"rewards/margins": 0.07438516616821289,
"rewards/rejected": -0.10425157845020294,
"step": 143
},
{
"epoch": 1.09,
"learning_rate": 3.539325842696629e-07,
"logps/chosen": -21.75617027282715,
"logps/rejected": -28.53704833984375,
"loss": 0.6455,
"losses/dpo": 0.6347097158432007,
"losses/sft": 0.6569658517837524,
"losses/total": 0.6347097158432007,
"ref_logps/chosen": -21.872474670410156,
"ref_logps/rejected": -27.540082931518555,
"rewards/accuracies": 0.7109375,
"rewards/chosen": 0.011630430817604065,
"rewards/margins": 0.11132718622684479,
"rewards/rejected": -0.09969674795866013,
"step": 144
},
{
"epoch": 1.09,
"learning_rate": 3.5252808988764045e-07,
"logps/chosen": -24.18975830078125,
"logps/rejected": -29.736862182617188,
"loss": 0.6407,
"losses/dpo": 0.6530706286430359,
"losses/sft": 0.8703383207321167,
"losses/total": 0.6530706286430359,
"ref_logps/chosen": -24.138484954833984,
"ref_logps/rejected": -28.495933532714844,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.005126964300870895,
"rewards/margins": 0.11896562576293945,
"rewards/rejected": -0.12409258633852005,
"step": 145
},
{
"epoch": 1.1,
"learning_rate": 3.51123595505618e-07,
"logps/chosen": -24.84428596496582,
"logps/rejected": -29.576303482055664,
"loss": 0.647,
"losses/dpo": 0.6477080583572388,
"losses/sft": 0.8653473854064941,
"losses/total": 0.6477080583572388,
"ref_logps/chosen": -24.755064010620117,
"ref_logps/rejected": -28.43872833251953,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -0.008922239765524864,
"rewards/margins": 0.10483534634113312,
"rewards/rejected": -0.11375758051872253,
"step": 146
},
{
"epoch": 1.11,
"learning_rate": 3.497191011235955e-07,
"logps/chosen": -24.983165740966797,
"logps/rejected": -27.753063201904297,
"loss": 0.6095,
"losses/dpo": 0.6273882389068604,
"losses/sft": 0.8987213373184204,
"losses/total": 0.6273882389068604,
"ref_logps/chosen": -25.17366600036621,
"ref_logps/rejected": -26.05366325378418,
"rewards/accuracies": 0.8125,
"rewards/chosen": 0.019050076603889465,
"rewards/margins": 0.1889900416135788,
"rewards/rejected": -0.16993993520736694,
"step": 147
},
{
"epoch": 1.12,
"learning_rate": 3.48314606741573e-07,
"logps/chosen": -22.61692237854004,
"logps/rejected": -27.743179321289062,
"loss": 0.6583,
"losses/dpo": 0.6790063381195068,
"losses/sft": 0.7648496627807617,
"losses/total": 0.6790063381195068,
"ref_logps/chosen": -22.40664291381836,
"ref_logps/rejected": -26.67925262451172,
"rewards/accuracies": 0.671875,
"rewards/chosen": -0.02102772891521454,
"rewards/margins": 0.08536479622125626,
"rewards/rejected": -0.1063925176858902,
"step": 148
},
{
"epoch": 1.12,
"learning_rate": 3.469101123595505e-07,
"logps/chosen": -22.846782684326172,
"logps/rejected": -29.590002059936523,
"loss": 0.6261,
"losses/dpo": 0.6479306221008301,
"losses/sft": 0.8049210906028748,
"losses/total": 0.6479306221008301,
"ref_logps/chosen": -23.011579513549805,
"ref_logps/rejected": -28.2562198638916,
"rewards/accuracies": 0.7734375,
"rewards/chosen": 0.016479745507240295,
"rewards/margins": 0.14985813200473785,
"rewards/rejected": -0.13337840139865875,
"step": 149
},
{
"epoch": 1.13,
"learning_rate": 3.4550561797752807e-07,
"logps/chosen": -21.699583053588867,
"logps/rejected": -27.46141815185547,
"loss": 0.6277,
"losses/dpo": 0.6358213424682617,
"losses/sft": 0.8344307541847229,
"losses/total": 0.6358213424682617,
"ref_logps/chosen": -21.698383331298828,
"ref_logps/rejected": -25.974313735961914,
"rewards/accuracies": 0.7734375,
"rewards/chosen": -0.00012012943625450134,
"rewards/margins": 0.14859014749526978,
"rewards/rejected": -0.14871028065681458,
"step": 150
},
{
"epoch": 1.14,
"learning_rate": 3.441011235955056e-07,
"logps/chosen": -20.88718032836914,
"logps/rejected": -25.436817169189453,
"loss": 0.654,
"losses/dpo": 0.6406779289245605,
"losses/sft": 0.8018806576728821,
"losses/total": 0.6406779289245605,
"ref_logps/chosen": -20.712448120117188,
"ref_logps/rejected": -24.37557029724121,
"rewards/accuracies": 0.671875,
"rewards/chosen": -0.01747327297925949,
"rewards/margins": 0.08865140378475189,
"rewards/rejected": -0.10612466931343079,
"step": 151
},
{
"epoch": 1.15,
"learning_rate": 3.4269662921348313e-07,
"logps/chosen": -22.312236785888672,
"logps/rejected": -30.142927169799805,
"loss": 0.6355,
"losses/dpo": 0.5932921171188354,
"losses/sft": 0.6528638005256653,
"losses/total": 0.5932921171188354,
"ref_logps/chosen": -22.210554122924805,
"ref_logps/rejected": -28.713180541992188,
"rewards/accuracies": 0.703125,
"rewards/chosen": -0.010168392211198807,
"rewards/margins": 0.13280624151229858,
"rewards/rejected": -0.1429746299982071,
"step": 152
},
{
"epoch": 1.15,
"learning_rate": 3.4129213483146064e-07,
"logps/chosen": -26.28810691833496,
"logps/rejected": -29.10406494140625,
"loss": 0.6359,
"losses/dpo": 0.6205468773841858,
"losses/sft": 0.8744308352470398,
"losses/total": 0.6205468773841858,
"ref_logps/chosen": -26.150360107421875,
"ref_logps/rejected": -27.655288696289062,
"rewards/accuracies": 0.7890625,
"rewards/chosen": -0.013774631544947624,
"rewards/margins": 0.1311032772064209,
"rewards/rejected": -0.14487791061401367,
"step": 153
},
{
"epoch": 1.16,
"learning_rate": 3.398876404494382e-07,
"logps/chosen": -22.283679962158203,
"logps/rejected": -26.302614212036133,
"loss": 0.6679,
"losses/dpo": 0.6655905246734619,
"losses/sft": 0.8864909410476685,
"losses/total": 0.6655905246734619,
"ref_logps/chosen": -21.867923736572266,
"ref_logps/rejected": -25.230295181274414,
"rewards/accuracies": 0.6484375,
"rewards/chosen": -0.04157543182373047,
"rewards/margins": 0.0656563863158226,
"rewards/rejected": -0.10723182559013367,
"step": 154
},
{
"epoch": 1.17,
"learning_rate": 3.3848314606741575e-07,
"logps/chosen": -22.68756103515625,
"logps/rejected": -28.45652961730957,
"loss": 0.6559,
"losses/dpo": 0.6645406484603882,
"losses/sft": 0.794353723526001,
"losses/total": 0.6645406484603882,
"ref_logps/chosen": -22.445241928100586,
"ref_logps/rejected": -27.26552963256836,
"rewards/accuracies": 0.6328125,
"rewards/chosen": -0.024232013151049614,
"rewards/margins": 0.09486782550811768,
"rewards/rejected": -0.11909983307123184,
"step": 155
},
{
"epoch": 1.18,
"learning_rate": 3.3707865168539325e-07,
"logps/chosen": -22.336397171020508,
"logps/rejected": -27.09580421447754,
"loss": 0.6194,
"losses/dpo": 0.5837043523788452,
"losses/sft": 0.9716494083404541,
"losses/total": 0.5837043523788452,
"ref_logps/chosen": -22.35472297668457,
"ref_logps/rejected": -25.422115325927734,
"rewards/accuracies": 0.7734375,
"rewards/chosen": 0.001832372508943081,
"rewards/margins": 0.16920123994350433,
"rewards/rejected": -0.16736885905265808,
"step": 156
},
{
"epoch": 1.18,
"learning_rate": 3.356741573033708e-07,
"logps/chosen": -22.49996566772461,
"logps/rejected": -28.435253143310547,
"loss": 0.6425,
"losses/dpo": 0.6696836948394775,
"losses/sft": 0.773880660533905,
"losses/total": 0.6696836948394775,
"ref_logps/chosen": -22.170368194580078,
"ref_logps/rejected": -26.929363250732422,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.032960131764411926,
"rewards/margins": 0.11762877553701401,
"rewards/rejected": -0.15058889985084534,
"step": 157
},
{
"epoch": 1.19,
"learning_rate": 3.3426966292134826e-07,
"logps/chosen": -22.498619079589844,
"logps/rejected": -30.868057250976562,
"loss": 0.6295,
"losses/dpo": 0.6400988101959229,
"losses/sft": 0.724359929561615,
"losses/total": 0.6400988101959229,
"ref_logps/chosen": -22.199575424194336,
"ref_logps/rejected": -29.1011962890625,
"rewards/accuracies": 0.7109375,
"rewards/chosen": -0.029904408380389214,
"rewards/margins": 0.1467815339565277,
"rewards/rejected": -0.17668592929840088,
"step": 158
},
{
"epoch": 1.2,
"learning_rate": 3.328651685393258e-07,
"logps/chosen": -24.872241973876953,
"logps/rejected": -29.327089309692383,
"loss": 0.6331,
"losses/dpo": 0.6349748373031616,
"losses/sft": 0.7728020548820496,
"losses/total": 0.6349748373031616,
"ref_logps/chosen": -24.600563049316406,
"ref_logps/rejected": -27.632978439331055,
"rewards/accuracies": 0.6953125,
"rewards/chosen": -0.027167750522494316,
"rewards/margins": 0.14224328100681305,
"rewards/rejected": -0.16941101849079132,
"step": 159
},
{
"epoch": 1.21,
"learning_rate": 3.314606741573033e-07,
"logps/chosen": -25.719676971435547,
"logps/rejected": -28.384960174560547,
"loss": 0.6269,
"losses/dpo": 0.6175022721290588,
"losses/sft": 0.8887324929237366,
"losses/total": 0.6175022721290588,
"ref_logps/chosen": -25.583393096923828,
"ref_logps/rejected": -26.621837615966797,
"rewards/accuracies": 0.703125,
"rewards/chosen": -0.013628311455249786,
"rewards/margins": 0.16268408298492432,
"rewards/rejected": -0.1763123720884323,
"step": 160
},
{
"epoch": 1.22,
"learning_rate": 3.300561797752809e-07,
"logps/chosen": -20.547767639160156,
"logps/rejected": -26.39871597290039,
"loss": 0.6418,
"losses/dpo": 0.604182243347168,
"losses/sft": 0.63340824842453,
"losses/total": 0.604182243347168,
"ref_logps/chosen": -20.272342681884766,
"ref_logps/rejected": -24.899860382080078,
"rewards/accuracies": 0.7109375,
"rewards/chosen": -0.027542442083358765,
"rewards/margins": 0.12234312295913696,
"rewards/rejected": -0.14988556504249573,
"step": 161
},
{
"epoch": 1.22,
"learning_rate": 3.2865168539325844e-07,
"logps/chosen": -22.42629623413086,
"logps/rejected": -27.69287872314453,
"loss": 0.6111,
"losses/dpo": 0.5942946672439575,
"losses/sft": 0.9472201466560364,
"losses/total": 0.5942946672439575,
"ref_logps/chosen": -22.135528564453125,
"ref_logps/rejected": -25.469520568847656,
"rewards/accuracies": 0.7421875,
"rewards/chosen": -0.029076654464006424,
"rewards/margins": 0.1932588517665863,
"rewards/rejected": -0.22233551740646362,
"step": 162
},
{
"epoch": 1.23,
"learning_rate": 3.2724719101123594e-07,
"logps/chosen": -23.306896209716797,
"logps/rejected": -28.64287567138672,
"loss": 0.6467,
"losses/dpo": 0.6821013689041138,
"losses/sft": 0.9050745368003845,
"losses/total": 0.6821013689041138,
"ref_logps/chosen": -22.86626625061035,
"ref_logps/rejected": -27.0958251953125,
"rewards/accuracies": 0.671875,
"rewards/chosen": -0.04406279698014259,
"rewards/margins": 0.11064193397760391,
"rewards/rejected": -0.1547047346830368,
"step": 163
},
{
"epoch": 1.24,
"learning_rate": 3.258426966292135e-07,
"logps/chosen": -24.126543045043945,
"logps/rejected": -26.020713806152344,
"loss": 0.6214,
"losses/dpo": 0.6081950664520264,
"losses/sft": 0.827450692653656,
"losses/total": 0.6081950664520264,
"ref_logps/chosen": -23.934072494506836,
"ref_logps/rejected": -24.092741012573242,
"rewards/accuracies": 0.765625,
"rewards/chosen": -0.01924710161983967,
"rewards/margins": 0.17355017364025116,
"rewards/rejected": -0.19279725849628448,
"step": 164
},
{
"epoch": 1.25,
"learning_rate": 3.24438202247191e-07,
"logps/chosen": -23.07083511352539,
"logps/rejected": -29.666513442993164,
"loss": 0.6401,
"losses/dpo": 0.6096771955490112,
"losses/sft": 0.7951339483261108,
"losses/total": 0.6096771955490112,
"ref_logps/chosen": -22.55157470703125,
"ref_logps/rejected": -27.86458396911621,
"rewards/accuracies": 0.6796875,
"rewards/chosen": -0.05192602425813675,
"rewards/margins": 0.12826718389987946,
"rewards/rejected": -0.1801932007074356,
"step": 165
},
{
"epoch": 1.25,
"learning_rate": 3.2303370786516856e-07,
"logps/chosen": -23.97926139831543,
"logps/rejected": -26.387611389160156,
"loss": 0.6543,
"losses/dpo": 0.5806229710578918,
"losses/sft": 0.9021787047386169,
"losses/total": 0.5806229710578918,
"ref_logps/chosen": -23.097957611083984,
"ref_logps/rejected": -24.48831558227539,
"rewards/accuracies": 0.609375,
"rewards/chosen": -0.0881301686167717,
"rewards/margins": 0.10179921984672546,
"rewards/rejected": -0.18992936611175537,
"step": 166
},
{
"epoch": 1.26,
"learning_rate": 3.21629213483146e-07,
"logps/chosen": -24.496349334716797,
"logps/rejected": -28.20893669128418,
"loss": 0.6439,
"losses/dpo": 0.5786381959915161,
"losses/sft": 0.9020153284072876,
"losses/total": 0.5786381959915161,
"ref_logps/chosen": -24.026926040649414,
"ref_logps/rejected": -26.54248046875,
"rewards/accuracies": 0.6484375,
"rewards/chosen": -0.04694243520498276,
"rewards/margins": 0.11970352381467819,
"rewards/rejected": -0.16664597392082214,
"step": 167
},
{
"epoch": 1.27,
"learning_rate": 3.2022471910112357e-07,
"logps/chosen": -23.227306365966797,
"logps/rejected": -29.19955825805664,
"loss": 0.6389,
"losses/dpo": 0.6521559953689575,
"losses/sft": 0.9907703399658203,
"losses/total": 0.6521559953689575,
"ref_logps/chosen": -22.804248809814453,
"ref_logps/rejected": -27.36874008178711,
"rewards/accuracies": 0.640625,
"rewards/chosen": -0.042305897921323776,
"rewards/margins": 0.14077602326869965,
"rewards/rejected": -0.18308192491531372,
"step": 168
},
{
"epoch": 1.28,
"learning_rate": 3.1882022471910107e-07,
"logps/chosen": -22.211841583251953,
"logps/rejected": -27.533721923828125,
"loss": 0.6512,
"losses/dpo": 0.6903020143508911,
"losses/sft": 0.8463045358657837,
"losses/total": 0.6903020143508911,
"ref_logps/chosen": -21.630611419677734,
"ref_logps/rejected": -25.909526824951172,
"rewards/accuracies": 0.6640625,
"rewards/chosen": -0.05812288075685501,
"rewards/margins": 0.10429678112268448,
"rewards/rejected": -0.1624196618795395,
"step": 169
},
{
"epoch": 1.28,
"learning_rate": 3.1741573033707863e-07,
"logps/chosen": -22.332489013671875,
"logps/rejected": -28.400074005126953,
"loss": 0.6155,
"losses/dpo": 0.6296464204788208,
"losses/sft": 0.6626120805740356,
"losses/total": 0.6296464204788208,
"ref_logps/chosen": -22.126087188720703,
"ref_logps/rejected": -26.414535522460938,
"rewards/accuracies": 0.796875,
"rewards/chosen": -0.020640213042497635,
"rewards/margins": 0.17791378498077393,
"rewards/rejected": -0.19855400919914246,
"step": 170
},
{
"epoch": 1.29,
"learning_rate": 3.160112359550562e-07,
"logps/chosen": -23.771900177001953,
"logps/rejected": -30.088207244873047,
"loss": 0.5971,
"losses/dpo": 0.6422166228294373,
"losses/sft": 0.7472187876701355,
"losses/total": 0.6422166228294373,
"ref_logps/chosen": -23.688966751098633,
"ref_logps/rejected": -27.655853271484375,
"rewards/accuracies": 0.7734375,
"rewards/chosen": -0.008293594233691692,
"rewards/margins": 0.23494186997413635,
"rewards/rejected": -0.24323543906211853,
"step": 171
},
{
"epoch": 1.3,
"learning_rate": 3.146067415730337e-07,
"logps/chosen": -23.348037719726562,
"logps/rejected": -27.53687286376953,
"loss": 0.6459,
"losses/dpo": 0.6455183029174805,
"losses/sft": 0.8395851850509644,
"losses/total": 0.6455183029174805,
"ref_logps/chosen": -22.63860511779785,
"ref_logps/rejected": -25.60868263244629,
"rewards/accuracies": 0.6640625,
"rewards/chosen": -0.0709431990981102,
"rewards/margins": 0.12187594175338745,
"rewards/rejected": -0.19281914830207825,
"step": 172
},
{
"epoch": 1.31,
"learning_rate": 3.1320224719101125e-07,
"logps/chosen": -24.17770767211914,
"logps/rejected": -30.49142074584961,
"loss": 0.627,
"losses/dpo": 0.6627662181854248,
"losses/sft": 0.9079832434654236,
"losses/total": 0.6627662181854248,
"ref_logps/chosen": -23.401166915893555,
"ref_logps/rejected": -28.0411376953125,
"rewards/accuracies": 0.6796875,
"rewards/chosen": -0.07765418291091919,
"rewards/margins": 0.16737422347068787,
"rewards/rejected": -0.24502840638160706,
"step": 173
},
{
"epoch": 1.31,
"learning_rate": 3.1179775280898875e-07,
"logps/chosen": -24.392324447631836,
"logps/rejected": -27.670101165771484,
"loss": 0.6251,
"losses/dpo": 0.6143248081207275,
"losses/sft": 0.6558141112327576,
"losses/total": 0.6143248081207275,
"ref_logps/chosen": -23.9196720123291,
"ref_logps/rejected": -25.53693389892578,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -0.047265198081731796,
"rewards/margins": 0.16605158150196075,
"rewards/rejected": -0.21331676840782166,
"step": 174
},
{
"epoch": 1.32,
"learning_rate": 3.103932584269663e-07,
"logps/chosen": -24.742660522460938,
"logps/rejected": -33.37188720703125,
"loss": 0.6157,
"losses/dpo": 0.5933184623718262,
"losses/sft": 0.9941530227661133,
"losses/total": 0.5933184623718262,
"ref_logps/chosen": -24.150442123413086,
"ref_logps/rejected": -30.931093215942383,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.05922209471464157,
"rewards/margins": 0.1848573535680771,
"rewards/rejected": -0.24407947063446045,
"step": 175
},
{
"epoch": 1.33,
"learning_rate": 3.0898876404494376e-07,
"logps/chosen": -23.68863296508789,
"logps/rejected": -28.516223907470703,
"loss": 0.6428,
"losses/dpo": 0.6548395156860352,
"losses/sft": 0.9564076066017151,
"losses/total": 0.6548395156860352,
"ref_logps/chosen": -22.73943519592285,
"ref_logps/rejected": -26.3284912109375,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.09491994976997375,
"rewards/margins": 0.12385320663452148,
"rewards/rejected": -0.21877314150333405,
"step": 176
},
{
"epoch": 1.34,
"learning_rate": 3.075842696629213e-07,
"logps/chosen": -22.527427673339844,
"logps/rejected": -31.241607666015625,
"loss": 0.6179,
"losses/dpo": 0.5700336694717407,
"losses/sft": 0.8869008421897888,
"losses/total": 0.5700336694717407,
"ref_logps/chosen": -21.73688507080078,
"ref_logps/rejected": -28.55950164794922,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.07905411720275879,
"rewards/margins": 0.18915657699108124,
"rewards/rejected": -0.2682107090950012,
"step": 177
},
{
"epoch": 1.34,
"learning_rate": 3.0617977528089887e-07,
"logps/chosen": -22.916969299316406,
"logps/rejected": -26.000946044921875,
"loss": 0.6425,
"losses/dpo": 0.651595413684845,
"losses/sft": 0.8127326369285583,
"losses/total": 0.651595413684845,
"ref_logps/chosen": -22.194671630859375,
"ref_logps/rejected": -23.969348907470703,
"rewards/accuracies": 0.671875,
"rewards/chosen": -0.07222998142242432,
"rewards/margins": 0.1309295892715454,
"rewards/rejected": -0.20315957069396973,
"step": 178
},
{
"epoch": 1.35,
"learning_rate": 3.047752808988764e-07,
"logps/chosen": -20.320987701416016,
"logps/rejected": -27.46251106262207,
"loss": 0.6217,
"losses/dpo": 0.7334872484207153,
"losses/sft": 0.9430239200592041,
"losses/total": 0.7334872484207153,
"ref_logps/chosen": -19.424144744873047,
"ref_logps/rejected": -24.763113021850586,
"rewards/accuracies": 0.6953125,
"rewards/chosen": -0.08968427777290344,
"rewards/margins": 0.18025556206703186,
"rewards/rejected": -0.2699398398399353,
"step": 179
},
{
"epoch": 1.36,
"learning_rate": 3.0337078651685393e-07,
"logps/chosen": -23.853857040405273,
"logps/rejected": -27.422889709472656,
"loss": 0.6381,
"losses/dpo": 0.6393001079559326,
"losses/sft": 0.766620397567749,
"losses/total": 0.6393001079559326,
"ref_logps/chosen": -22.798233032226562,
"ref_logps/rejected": -24.980205535888672,
"rewards/accuracies": 0.6796875,
"rewards/chosen": -0.10556241869926453,
"rewards/margins": 0.13870559632778168,
"rewards/rejected": -0.2442680299282074,
"step": 180
},
{
"epoch": 1.37,
"learning_rate": 3.0196629213483144e-07,
"logps/chosen": -23.360549926757812,
"logps/rejected": -27.110477447509766,
"loss": 0.6234,
"losses/dpo": 0.6311055421829224,
"losses/sft": 0.9324018955230713,
"losses/total": 0.6311055421829224,
"ref_logps/chosen": -22.890331268310547,
"ref_logps/rejected": -24.903316497802734,
"rewards/accuracies": 0.7421875,
"rewards/chosen": -0.04702185466885567,
"rewards/margins": 0.17369432747364044,
"rewards/rejected": -0.2207161784172058,
"step": 181
},
{
"epoch": 1.37,
"learning_rate": 3.00561797752809e-07,
"logps/chosen": -23.004093170166016,
"logps/rejected": -31.04292106628418,
"loss": 0.5926,
"losses/dpo": 0.6243355870246887,
"losses/sft": 0.8456003665924072,
"losses/total": 0.6243355870246887,
"ref_logps/chosen": -22.555362701416016,
"ref_logps/rejected": -28.14826011657715,
"rewards/accuracies": 0.7578125,
"rewards/chosen": -0.04487309604883194,
"rewards/margins": 0.24459321796894073,
"rewards/rejected": -0.28946632146835327,
"step": 182
},
{
"epoch": 1.38,
"learning_rate": 2.991573033707865e-07,
"logps/chosen": -26.668237686157227,
"logps/rejected": -30.511489868164062,
"loss": 0.6099,
"losses/dpo": 0.6743872761726379,
"losses/sft": 0.836949348449707,
"losses/total": 0.6743872761726379,
"ref_logps/chosen": -25.680599212646484,
"ref_logps/rejected": -27.35342788696289,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -0.09876400232315063,
"rewards/margins": 0.2170422226190567,
"rewards/rejected": -0.31580623984336853,
"step": 183
},
{
"epoch": 1.39,
"learning_rate": 2.9775280898876406e-07,
"logps/chosen": -23.974590301513672,
"logps/rejected": -28.162975311279297,
"loss": 0.6119,
"losses/dpo": 0.5823447704315186,
"losses/sft": 0.8065779805183411,
"losses/total": 0.5823447704315186,
"ref_logps/chosen": -23.180667877197266,
"ref_logps/rejected": -25.296037673950195,
"rewards/accuracies": 0.7109375,
"rewards/chosen": -0.07939236611127853,
"rewards/margins": 0.20730134844779968,
"rewards/rejected": -0.2866936922073364,
"step": 184
},
{
"epoch": 1.4,
"learning_rate": 2.9634831460674156e-07,
"logps/chosen": -24.01116943359375,
"logps/rejected": -30.05943489074707,
"loss": 0.6203,
"losses/dpo": 0.5889841318130493,
"losses/sft": 0.8877280354499817,
"losses/total": 0.5889841318130493,
"ref_logps/chosen": -22.781108856201172,
"ref_logps/rejected": -27.01274871826172,
"rewards/accuracies": 0.734375,
"rewards/chosen": -0.12300599366426468,
"rewards/margins": 0.18166252970695496,
"rewards/rejected": -0.3046685457229614,
"step": 185
},
{
"epoch": 1.4,
"learning_rate": 2.9494382022471906e-07,
"logps/chosen": -22.79621124267578,
"logps/rejected": -28.1258544921875,
"loss": 0.6198,
"losses/dpo": 0.6025291681289673,
"losses/sft": 0.93308424949646,
"losses/total": 0.6025291681289673,
"ref_logps/chosen": -21.591278076171875,
"ref_logps/rejected": -25.04897689819336,
"rewards/accuracies": 0.7421875,
"rewards/chosen": -0.12049318104982376,
"rewards/margins": 0.18719442188739777,
"rewards/rejected": -0.3076876401901245,
"step": 186
},
{
"epoch": 1.41,
"learning_rate": 2.935393258426966e-07,
"logps/chosen": -24.246837615966797,
"logps/rejected": -30.58446502685547,
"loss": 0.6277,
"losses/dpo": 0.5978178977966309,
"losses/sft": 0.7778979539871216,
"losses/total": 0.5978178977966309,
"ref_logps/chosen": -23.127248764038086,
"ref_logps/rejected": -27.7061767578125,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.11195877939462662,
"rewards/margins": 0.17587023973464966,
"rewards/rejected": -0.2878290116786957,
"step": 187
},
{
"epoch": 1.42,
"learning_rate": 2.921348314606741e-07,
"logps/chosen": -24.55533218383789,
"logps/rejected": -29.098743438720703,
"loss": 0.6458,
"losses/dpo": 0.6147331595420837,
"losses/sft": 0.8299495577812195,
"losses/total": 0.6147331595420837,
"ref_logps/chosen": -23.001358032226562,
"ref_logps/rejected": -26.009681701660156,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.1553977131843567,
"rewards/margins": 0.15350814163684845,
"rewards/rejected": -0.30890583992004395,
"step": 188
},
{
"epoch": 1.43,
"learning_rate": 2.907303370786517e-07,
"logps/chosen": -22.7973690032959,
"logps/rejected": -30.61502456665039,
"loss": 0.5968,
"losses/dpo": 0.5409806370735168,
"losses/sft": 0.8110998272895813,
"losses/total": 0.5409806370735168,
"ref_logps/chosen": -21.900728225708008,
"ref_logps/rejected": -27.346271514892578,
"rewards/accuracies": 0.7578125,
"rewards/chosen": -0.08966411650180817,
"rewards/margins": 0.23721098899841309,
"rewards/rejected": -0.32687509059906006,
"step": 189
},
{
"epoch": 1.43,
"learning_rate": 2.893258426966292e-07,
"logps/chosen": -21.656837463378906,
"logps/rejected": -28.09313201904297,
"loss": 0.636,
"losses/dpo": 0.6395488977432251,
"losses/sft": 0.8838689923286438,
"losses/total": 0.6395488977432251,
"ref_logps/chosen": -20.17813491821289,
"ref_logps/rejected": -25.027902603149414,
"rewards/accuracies": 0.6484375,
"rewards/chosen": -0.14787010848522186,
"rewards/margins": 0.15865309536457062,
"rewards/rejected": -0.3065232038497925,
"step": 190
},
{
"epoch": 1.44,
"learning_rate": 2.8792134831460674e-07,
"logps/chosen": -23.13861083984375,
"logps/rejected": -32.06410217285156,
"loss": 0.6131,
"losses/dpo": 0.6822565197944641,
"losses/sft": 0.7876338362693787,
"losses/total": 0.6822565197944641,
"ref_logps/chosen": -21.99342918395996,
"ref_logps/rejected": -28.761310577392578,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.11451825499534607,
"rewards/margins": 0.21576061844825745,
"rewards/rejected": -0.33027884364128113,
"step": 191
},
{
"epoch": 1.45,
"learning_rate": 2.8651685393258425e-07,
"logps/chosen": -22.36726951599121,
"logps/rejected": -27.791099548339844,
"loss": 0.6132,
"losses/dpo": 0.5694007873535156,
"losses/sft": 0.7940797805786133,
"losses/total": 0.5694007873535156,
"ref_logps/chosen": -21.662071228027344,
"ref_logps/rejected": -25.100269317626953,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -0.07051999121904373,
"rewards/margins": 0.19856315851211548,
"rewards/rejected": -0.2690831422805786,
"step": 192
},
{
"epoch": 1.46,
"learning_rate": 2.851123595505618e-07,
"logps/chosen": -24.541927337646484,
"logps/rejected": -30.479598999023438,
"loss": 0.6251,
"losses/dpo": 0.6676912307739258,
"losses/sft": 0.8101266026496887,
"losses/total": 0.6676912307739258,
"ref_logps/chosen": -23.375761032104492,
"ref_logps/rejected": -27.565099716186523,
"rewards/accuracies": 0.7109375,
"rewards/chosen": -0.11661653220653534,
"rewards/margins": 0.1748332977294922,
"rewards/rejected": -0.2914498448371887,
"step": 193
},
{
"epoch": 1.46,
"learning_rate": 2.8370786516853936e-07,
"logps/chosen": -23.954505920410156,
"logps/rejected": -30.262849807739258,
"loss": 0.6289,
"losses/dpo": 0.6359354257583618,
"losses/sft": 0.846460223197937,
"losses/total": 0.6359354257583618,
"ref_logps/chosen": -22.69145965576172,
"ref_logps/rejected": -27.221202850341797,
"rewards/accuracies": 0.671875,
"rewards/chosen": -0.12630482017993927,
"rewards/margins": 0.1778600960969925,
"rewards/rejected": -0.30416491627693176,
"step": 194
},
{
"epoch": 1.47,
"learning_rate": 2.823033707865168e-07,
"logps/chosen": -25.615474700927734,
"logps/rejected": -32.26765823364258,
"loss": 0.6017,
"losses/dpo": 0.6264960765838623,
"losses/sft": 0.906339704990387,
"losses/total": 0.6264960765838623,
"ref_logps/chosen": -24.62253189086914,
"ref_logps/rejected": -29.02202796936035,
"rewards/accuracies": 0.7421875,
"rewards/chosen": -0.09929438680410385,
"rewards/margins": 0.22526855766773224,
"rewards/rejected": -0.3245629370212555,
"step": 195
},
{
"epoch": 1.48,
"learning_rate": 2.8089887640449437e-07,
"logps/chosen": -22.84251594543457,
"logps/rejected": -28.347021102905273,
"loss": 0.6191,
"losses/dpo": 0.6483104825019836,
"losses/sft": 0.9074235558509827,
"losses/total": 0.6483104825019836,
"ref_logps/chosen": -21.419048309326172,
"ref_logps/rejected": -25.032745361328125,
"rewards/accuracies": 0.703125,
"rewards/chosen": -0.14234672486782074,
"rewards/margins": 0.18908075988292694,
"rewards/rejected": -0.3314274847507477,
"step": 196
},
{
"epoch": 1.49,
"learning_rate": 2.794943820224719e-07,
"logps/chosen": -23.73548126220703,
"logps/rejected": -28.329975128173828,
"loss": 0.6238,
"losses/dpo": 0.6014984250068665,
"losses/sft": 0.773016631603241,
"losses/total": 0.6014984250068665,
"ref_logps/chosen": -22.425506591796875,
"ref_logps/rejected": -25.099872589111328,
"rewards/accuracies": 0.6796875,
"rewards/chosen": -0.13099724054336548,
"rewards/margins": 0.19201286137104034,
"rewards/rejected": -0.323010116815567,
"step": 197
},
{
"epoch": 1.49,
"learning_rate": 2.7808988764044943e-07,
"logps/chosen": -26.183156967163086,
"logps/rejected": -30.921403884887695,
"loss": 0.625,
"losses/dpo": 0.6309884190559387,
"losses/sft": 0.8918415307998657,
"losses/total": 0.6309884190559387,
"ref_logps/chosen": -24.701202392578125,
"ref_logps/rejected": -27.679357528686523,
"rewards/accuracies": 0.6953125,
"rewards/chosen": -0.14819550514221191,
"rewards/margins": 0.17600935697555542,
"rewards/rejected": -0.32420486211776733,
"step": 198
},
{
"epoch": 1.5,
"learning_rate": 2.7668539325842694e-07,
"logps/chosen": -23.88658905029297,
"logps/rejected": -29.73432731628418,
"loss": 0.6156,
"losses/dpo": 0.6188192367553711,
"losses/sft": 0.8410817384719849,
"losses/total": 0.6188192367553711,
"ref_logps/chosen": -22.504894256591797,
"ref_logps/rejected": -26.384294509887695,
"rewards/accuracies": 0.6328125,
"rewards/chosen": -0.13816949725151062,
"rewards/margins": 0.19683387875556946,
"rewards/rejected": -0.3350033462047577,
"step": 199
},
{
"epoch": 1.51,
"learning_rate": 2.752808988764045e-07,
"logps/chosen": -23.145811080932617,
"logps/rejected": -29.627685546875,
"loss": 0.637,
"losses/dpo": 0.6995939612388611,
"losses/sft": 0.9283435344696045,
"losses/total": 0.6995939612388611,
"ref_logps/chosen": -21.45529556274414,
"ref_logps/rejected": -26.336702346801758,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.1690514236688614,
"rewards/margins": 0.16004663705825806,
"rewards/rejected": -0.32909804582595825,
"step": 200
},
{
"epoch": 1.52,
"learning_rate": 2.73876404494382e-07,
"logps/chosen": -22.545406341552734,
"logps/rejected": -30.04849624633789,
"loss": 0.608,
"losses/dpo": 0.6513813734054565,
"losses/sft": 0.9403305649757385,
"losses/total": 0.6513813734054565,
"ref_logps/chosen": -21.010854721069336,
"ref_logps/rejected": -26.149032592773438,
"rewards/accuracies": 0.671875,
"rewards/chosen": -0.15345513820648193,
"rewards/margins": 0.23649117350578308,
"rewards/rejected": -0.3899462819099426,
"step": 201
},
{
"epoch": 1.52,
"learning_rate": 2.7247191011235955e-07,
"logps/chosen": -22.640438079833984,
"logps/rejected": -28.583681106567383,
"loss": 0.6007,
"losses/dpo": 0.5443820357322693,
"losses/sft": 0.8517413139343262,
"losses/total": 0.5443820357322693,
"ref_logps/chosen": -21.29751968383789,
"ref_logps/rejected": -24.850605010986328,
"rewards/accuracies": 0.765625,
"rewards/chosen": -0.1342916190624237,
"rewards/margins": 0.23901620507240295,
"rewards/rejected": -0.37330782413482666,
"step": 202
},
{
"epoch": 1.53,
"learning_rate": 2.710674157303371e-07,
"logps/chosen": -25.259624481201172,
"logps/rejected": -32.96052551269531,
"loss": 0.6029,
"losses/dpo": 0.5749891996383667,
"losses/sft": 0.9417051672935486,
"losses/total": 0.5749891996383667,
"ref_logps/chosen": -23.468887329101562,
"ref_logps/rejected": -28.776565551757812,
"rewards/accuracies": 0.7421875,
"rewards/chosen": -0.17907381057739258,
"rewards/margins": 0.23932181298732758,
"rewards/rejected": -0.41839560866355896,
"step": 203
},
{
"epoch": 1.54,
"learning_rate": 2.6966292134831456e-07,
"logps/chosen": -24.431142807006836,
"logps/rejected": -31.409852981567383,
"loss": 0.6256,
"losses/dpo": 0.6045551896095276,
"losses/sft": 0.8162484169006348,
"losses/total": 0.6045551896095276,
"ref_logps/chosen": -22.7187442779541,
"ref_logps/rejected": -27.74604606628418,
"rewards/accuracies": 0.703125,
"rewards/chosen": -0.1712397187948227,
"rewards/margins": 0.19514092803001404,
"rewards/rejected": -0.36638063192367554,
"step": 204
},
{
"epoch": 1.55,
"learning_rate": 2.682584269662921e-07,
"logps/chosen": -22.776988983154297,
"logps/rejected": -30.418426513671875,
"loss": 0.6093,
"losses/dpo": 0.630817711353302,
"losses/sft": 0.907343327999115,
"losses/total": 0.630817711353302,
"ref_logps/chosen": -21.150266647338867,
"ref_logps/rejected": -26.52399444580078,
"rewards/accuracies": 0.734375,
"rewards/chosen": -0.16267219185829163,
"rewards/margins": 0.22677099704742432,
"rewards/rejected": -0.38944315910339355,
"step": 205
},
{
"epoch": 1.55,
"learning_rate": 2.668539325842696e-07,
"logps/chosen": -24.300395965576172,
"logps/rejected": -32.63694763183594,
"loss": 0.5834,
"losses/dpo": 0.5977815389633179,
"losses/sft": 0.8870611190795898,
"losses/total": 0.5977815389633179,
"ref_logps/chosen": -22.923202514648438,
"ref_logps/rejected": -28.30066680908203,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.13771943747997284,
"rewards/margins": 0.29590874910354614,
"rewards/rejected": -0.4336281716823578,
"step": 206
},
{
"epoch": 1.56,
"learning_rate": 2.654494382022472e-07,
"logps/chosen": -25.562063217163086,
"logps/rejected": -28.686279296875,
"loss": 0.6248,
"losses/dpo": 0.593975841999054,
"losses/sft": 0.8298511505126953,
"losses/total": 0.593975841999054,
"ref_logps/chosen": -23.863605499267578,
"ref_logps/rejected": -24.980735778808594,
"rewards/accuracies": 0.6171875,
"rewards/chosen": -0.16984564065933228,
"rewards/margins": 0.2007087767124176,
"rewards/rejected": -0.3705544173717499,
"step": 207
},
{
"epoch": 1.57,
"learning_rate": 2.640449438202247e-07,
"logps/chosen": -24.133087158203125,
"logps/rejected": -32.86896514892578,
"loss": 0.6072,
"losses/dpo": 0.5785881280899048,
"losses/sft": 0.9283973574638367,
"losses/total": 0.5785881280899048,
"ref_logps/chosen": -22.546520233154297,
"ref_logps/rejected": -28.988473892211914,
"rewards/accuracies": 0.671875,
"rewards/chosen": -0.158656507730484,
"rewards/margins": 0.2293928861618042,
"rewards/rejected": -0.3880493640899658,
"step": 208
},
{
"epoch": 1.58,
"learning_rate": 2.6264044943820224e-07,
"logps/chosen": -21.610166549682617,
"logps/rejected": -33.77753448486328,
"loss": 0.5743,
"losses/dpo": 0.5111271142959595,
"losses/sft": 0.7807843685150146,
"losses/total": 0.5111271142959595,
"ref_logps/chosen": -20.26101303100586,
"ref_logps/rejected": -29.390432357788086,
"rewards/accuracies": 0.7578125,
"rewards/chosen": -0.1349155306816101,
"rewards/margins": 0.3037945628166199,
"rewards/rejected": -0.43871009349823,
"step": 209
},
{
"epoch": 1.58,
"learning_rate": 2.612359550561798e-07,
"logps/chosen": -24.600027084350586,
"logps/rejected": -28.993408203125,
"loss": 0.621,
"losses/dpo": 0.6254321336746216,
"losses/sft": 0.7647839188575745,
"losses/total": 0.6254321336746216,
"ref_logps/chosen": -22.838638305664062,
"ref_logps/rejected": -25.172962188720703,
"rewards/accuracies": 0.6796875,
"rewards/chosen": -0.1761387437582016,
"rewards/margins": 0.20590564608573914,
"rewards/rejected": -0.38204440474510193,
"step": 210
},
{
"epoch": 1.59,
"learning_rate": 2.598314606741573e-07,
"logps/chosen": -25.24309730529785,
"logps/rejected": -32.02477264404297,
"loss": 0.6078,
"losses/dpo": 0.6571998000144958,
"losses/sft": 0.8880329728126526,
"losses/total": 0.6571998000144958,
"ref_logps/chosen": -23.272363662719727,
"ref_logps/rejected": -27.582080841064453,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.19707328081130981,
"rewards/margins": 0.24719560146331787,
"rewards/rejected": -0.4442688822746277,
"step": 211
},
{
"epoch": 1.6,
"learning_rate": 2.5842696629213486e-07,
"logps/chosen": -23.570541381835938,
"logps/rejected": -31.662994384765625,
"loss": 0.5954,
"losses/dpo": 0.6153095960617065,
"losses/sft": 0.7867841720581055,
"losses/total": 0.6153095960617065,
"ref_logps/chosen": -21.58125114440918,
"ref_logps/rejected": -27.1029052734375,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.19892916083335876,
"rewards/margins": 0.2570798993110657,
"rewards/rejected": -0.45600906014442444,
"step": 212
},
{
"epoch": 1.61,
"learning_rate": 2.5702247191011236e-07,
"logps/chosen": -26.515090942382812,
"logps/rejected": -33.26690673828125,
"loss": 0.5944,
"losses/dpo": 0.559239387512207,
"losses/sft": 0.8030417561531067,
"losses/total": 0.559239387512207,
"ref_logps/chosen": -24.70389175415039,
"ref_logps/rejected": -28.788631439208984,
"rewards/accuracies": 0.7421875,
"rewards/chosen": -0.18111974000930786,
"rewards/margins": 0.2667076587677002,
"rewards/rejected": -0.44782739877700806,
"step": 213
},
{
"epoch": 1.62,
"learning_rate": 2.5561797752808987e-07,
"logps/chosen": -23.109725952148438,
"logps/rejected": -30.950822830200195,
"loss": 0.6028,
"losses/dpo": 0.6463332772254944,
"losses/sft": 0.867030918598175,
"losses/total": 0.6463332772254944,
"ref_logps/chosen": -21.529489517211914,
"ref_logps/rejected": -26.95291519165039,
"rewards/accuracies": 0.7421875,
"rewards/chosen": -0.15802377462387085,
"rewards/margins": 0.24176692962646484,
"rewards/rejected": -0.3997907340526581,
"step": 214
},
{
"epoch": 1.62,
"learning_rate": 2.5421348314606737e-07,
"logps/chosen": -22.45772933959961,
"logps/rejected": -30.6645450592041,
"loss": 0.548,
"losses/dpo": 0.49787038564682007,
"losses/sft": 0.9076435565948486,
"losses/total": 0.49787038564682007,
"ref_logps/chosen": -21.689294815063477,
"ref_logps/rejected": -26.167482376098633,
"rewards/accuracies": 0.796875,
"rewards/chosen": -0.07684363424777985,
"rewards/margins": 0.37286245822906494,
"rewards/rejected": -0.449706107378006,
"step": 215
},
{
"epoch": 1.63,
"learning_rate": 2.5280898876404493e-07,
"logps/chosen": -23.930644989013672,
"logps/rejected": -31.34885597229004,
"loss": 0.5791,
"losses/dpo": 0.6228358745574951,
"losses/sft": 0.894844651222229,
"losses/total": 0.6228358745574951,
"ref_logps/chosen": -22.003002166748047,
"ref_logps/rejected": -26.41282844543457,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -0.19276437163352966,
"rewards/margins": 0.3008383512496948,
"rewards/rejected": -0.4936027228832245,
"step": 216
},
{
"epoch": 1.64,
"learning_rate": 2.5140449438202243e-07,
"logps/chosen": -25.59225082397461,
"logps/rejected": -30.82415199279785,
"loss": 0.5571,
"losses/dpo": 0.5233840942382812,
"losses/sft": 0.8860921263694763,
"losses/total": 0.5233840942382812,
"ref_logps/chosen": -23.89864730834961,
"ref_logps/rejected": -25.53179168701172,
"rewards/accuracies": 0.765625,
"rewards/chosen": -0.16936028003692627,
"rewards/margins": 0.35987579822540283,
"rewards/rejected": -0.5292361378669739,
"step": 217
},
{
"epoch": 1.65,
"learning_rate": 2.5e-07,
"logps/chosen": -26.896615982055664,
"logps/rejected": -32.64814376831055,
"loss": 0.5821,
"losses/dpo": 0.5345016121864319,
"losses/sft": 0.9819333553314209,
"losses/total": 0.5345016121864319,
"ref_logps/chosen": -24.95808982849121,
"ref_logps/rejected": -27.606571197509766,
"rewards/accuracies": 0.703125,
"rewards/chosen": -0.19385257363319397,
"rewards/margins": 0.3103046417236328,
"rewards/rejected": -0.5041571855545044,
"step": 218
},
{
"epoch": 1.65,
"learning_rate": 2.485955056179775e-07,
"logps/chosen": -21.461519241333008,
"logps/rejected": -29.887657165527344,
"loss": 0.5621,
"losses/dpo": 0.5603345632553101,
"losses/sft": 0.7855640649795532,
"losses/total": 0.5603345632553101,
"ref_logps/chosen": -20.122406005859375,
"ref_logps/rejected": -24.85255241394043,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.13391147553920746,
"rewards/margins": 0.3695991039276123,
"rewards/rejected": -0.503510594367981,
"step": 219
},
{
"epoch": 1.66,
"learning_rate": 2.4719101123595505e-07,
"logps/chosen": -22.143098831176758,
"logps/rejected": -34.4566764831543,
"loss": 0.5923,
"losses/dpo": 0.5465586185455322,
"losses/sft": 1.051912546157837,
"losses/total": 0.5465586185455322,
"ref_logps/chosen": -20.453866958618164,
"ref_logps/rejected": -30.012981414794922,
"rewards/accuracies": 0.6796875,
"rewards/chosen": -0.16892319917678833,
"rewards/margins": 0.27544665336608887,
"rewards/rejected": -0.4443698525428772,
"step": 220
},
{
"epoch": 1.67,
"learning_rate": 2.4578651685393255e-07,
"logps/chosen": -24.042566299438477,
"logps/rejected": -29.772445678710938,
"loss": 0.6149,
"losses/dpo": 0.6469910144805908,
"losses/sft": 1.0151987075805664,
"losses/total": 0.6469910144805908,
"ref_logps/chosen": -21.836162567138672,
"ref_logps/rejected": -25.394845962524414,
"rewards/accuracies": 0.6328125,
"rewards/chosen": -0.220640629529953,
"rewards/margins": 0.2171194702386856,
"rewards/rejected": -0.4377601146697998,
"step": 221
},
{
"epoch": 1.68,
"learning_rate": 2.443820224719101e-07,
"logps/chosen": -24.834793090820312,
"logps/rejected": -33.834083557128906,
"loss": 0.5676,
"losses/dpo": 0.6051491498947144,
"losses/sft": 0.8380707502365112,
"losses/total": 0.6051491498947144,
"ref_logps/chosen": -22.940967559814453,
"ref_logps/rejected": -28.49428939819336,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.18938273191452026,
"rewards/margins": 0.3445969223976135,
"rewards/rejected": -0.5339796543121338,
"step": 222
},
{
"epoch": 1.68,
"learning_rate": 2.429775280898876e-07,
"logps/chosen": -25.5327091217041,
"logps/rejected": -30.429113388061523,
"loss": 0.6089,
"losses/dpo": 0.5853685140609741,
"losses/sft": 0.6926910877227783,
"losses/total": 0.5853685140609741,
"ref_logps/chosen": -23.138214111328125,
"ref_logps/rejected": -25.674575805664062,
"rewards/accuracies": 0.7109375,
"rewards/chosen": -0.23944953083992004,
"rewards/margins": 0.2360040545463562,
"rewards/rejected": -0.47545361518859863,
"step": 223
},
{
"epoch": 1.69,
"learning_rate": 2.4157303370786517e-07,
"logps/chosen": -24.123153686523438,
"logps/rejected": -29.51090431213379,
"loss": 0.6134,
"losses/dpo": 0.7566800117492676,
"losses/sft": 0.9139145612716675,
"losses/total": 0.7566800117492676,
"ref_logps/chosen": -22.065155029296875,
"ref_logps/rejected": -24.94894027709961,
"rewards/accuracies": 0.7109375,
"rewards/chosen": -0.20579975843429565,
"rewards/margins": 0.250396728515625,
"rewards/rejected": -0.45619648694992065,
"step": 224
},
{
"epoch": 1.7,
"learning_rate": 2.401685393258427e-07,
"logps/chosen": -26.274799346923828,
"logps/rejected": -32.90815734863281,
"loss": 0.6145,
"losses/dpo": 0.6078730225563049,
"losses/sft": 1.1017650365829468,
"losses/total": 0.6078730225563049,
"ref_logps/chosen": -23.55907440185547,
"ref_logps/rejected": -27.880718231201172,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.2715725004673004,
"rewards/margins": 0.23117120563983917,
"rewards/rejected": -0.5027437210083008,
"step": 225
},
{
"epoch": 1.71,
"learning_rate": 2.3876404494382023e-07,
"logps/chosen": -25.727689743041992,
"logps/rejected": -30.410335540771484,
"loss": 0.6292,
"losses/dpo": 0.6031284332275391,
"losses/sft": 0.7834776639938354,
"losses/total": 0.6031284332275391,
"ref_logps/chosen": -23.430198669433594,
"ref_logps/rejected": -26.02400779724121,
"rewards/accuracies": 0.6796875,
"rewards/chosen": -0.22974896430969238,
"rewards/margins": 0.20888389647006989,
"rewards/rejected": -0.4386328458786011,
"step": 226
},
{
"epoch": 1.71,
"learning_rate": 2.3735955056179774e-07,
"logps/chosen": -25.917598724365234,
"logps/rejected": -31.14261245727539,
"loss": 0.5928,
"losses/dpo": 0.5714601874351501,
"losses/sft": 0.8888335227966309,
"losses/total": 0.5714601874351501,
"ref_logps/chosen": -23.436574935913086,
"ref_logps/rejected": -25.756431579589844,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -0.24810227751731873,
"rewards/margins": 0.29051584005355835,
"rewards/rejected": -0.5386180877685547,
"step": 227
},
{
"epoch": 1.72,
"learning_rate": 2.3595505617977527e-07,
"logps/chosen": -25.50743865966797,
"logps/rejected": -34.945220947265625,
"loss": 0.5505,
"losses/dpo": 0.5715539455413818,
"losses/sft": 0.8663308620452881,
"losses/total": 0.5715539455413818,
"ref_logps/chosen": -23.417984008789062,
"ref_logps/rejected": -28.690208435058594,
"rewards/accuracies": 0.8203125,
"rewards/chosen": -0.20894566178321838,
"rewards/margins": 0.41655558347702026,
"rewards/rejected": -0.625501275062561,
"step": 228
},
{
"epoch": 1.73,
"learning_rate": 2.345505617977528e-07,
"logps/chosen": -23.620698928833008,
"logps/rejected": -34.89327621459961,
"loss": 0.571,
"losses/dpo": 0.6053493022918701,
"losses/sft": 0.8246825933456421,
"losses/total": 0.6053493022918701,
"ref_logps/chosen": -21.27004623413086,
"ref_logps/rejected": -29.035568237304688,
"rewards/accuracies": 0.7109375,
"rewards/chosen": -0.23506540060043335,
"rewards/margins": 0.35070547461509705,
"rewards/rejected": -0.585770845413208,
"step": 229
},
{
"epoch": 1.74,
"learning_rate": 2.331460674157303e-07,
"logps/chosen": -21.874225616455078,
"logps/rejected": -34.58841323852539,
"loss": 0.5745,
"losses/dpo": 0.5964910984039307,
"losses/sft": 0.842921793460846,
"losses/total": 0.5964910984039307,
"ref_logps/chosen": -19.500164031982422,
"ref_logps/rejected": -28.771209716796875,
"rewards/accuracies": 0.6796875,
"rewards/chosen": -0.23740598559379578,
"rewards/margins": 0.34431448578834534,
"rewards/rejected": -0.5817204713821411,
"step": 230
},
{
"epoch": 1.74,
"learning_rate": 2.3174157303370786e-07,
"logps/chosen": -24.84224510192871,
"logps/rejected": -32.233497619628906,
"loss": 0.6064,
"losses/dpo": 0.5861349105834961,
"losses/sft": 0.9263943433761597,
"losses/total": 0.5861349105834961,
"ref_logps/chosen": -21.88359832763672,
"ref_logps/rejected": -26.701745986938477,
"rewards/accuracies": 0.6796875,
"rewards/chosen": -0.29586488008499146,
"rewards/margins": 0.2573099732398987,
"rewards/rejected": -0.5531748533248901,
"step": 231
},
{
"epoch": 1.75,
"learning_rate": 2.303370786516854e-07,
"logps/chosen": -25.4254207611084,
"logps/rejected": -34.96025085449219,
"loss": 0.5747,
"losses/dpo": 0.5563768744468689,
"losses/sft": 0.9355225563049316,
"losses/total": 0.5563768744468689,
"ref_logps/chosen": -22.772850036621094,
"ref_logps/rejected": -28.902484893798828,
"rewards/accuracies": 0.734375,
"rewards/chosen": -0.26525697112083435,
"rewards/margins": 0.3405200242996216,
"rewards/rejected": -0.6057769656181335,
"step": 232
},
{
"epoch": 1.76,
"learning_rate": 2.2893258426966292e-07,
"logps/chosen": -26.856834411621094,
"logps/rejected": -34.212364196777344,
"loss": 0.6228,
"losses/dpo": 0.6681157946586609,
"losses/sft": 1.0442770719528198,
"losses/total": 0.6681157946586609,
"ref_logps/chosen": -23.627426147460938,
"ref_logps/rejected": -28.62677001953125,
"rewards/accuracies": 0.640625,
"rewards/chosen": -0.32294073700904846,
"rewards/margins": 0.23561875522136688,
"rewards/rejected": -0.5585595369338989,
"step": 233
},
{
"epoch": 1.77,
"learning_rate": 2.2752808988764045e-07,
"logps/chosen": -26.366958618164062,
"logps/rejected": -33.41276550292969,
"loss": 0.6217,
"losses/dpo": 0.6866650581359863,
"losses/sft": 0.8693393468856812,
"losses/total": 0.6866650581359863,
"ref_logps/chosen": -23.189382553100586,
"ref_logps/rejected": -27.676807403564453,
"rewards/accuracies": 0.703125,
"rewards/chosen": -0.31775763630867004,
"rewards/margins": 0.25583818554878235,
"rewards/rejected": -0.5735958218574524,
"step": 234
},
{
"epoch": 1.77,
"learning_rate": 2.2612359550561795e-07,
"logps/chosen": -24.26227569580078,
"logps/rejected": -32.4229736328125,
"loss": 0.604,
"losses/dpo": 0.5642524361610413,
"losses/sft": 0.9980260133743286,
"losses/total": 0.5642524361610413,
"ref_logps/chosen": -21.425315856933594,
"ref_logps/rejected": -26.820331573486328,
"rewards/accuracies": 0.609375,
"rewards/chosen": -0.28369593620300293,
"rewards/margins": 0.2765684127807617,
"rewards/rejected": -0.5602643489837646,
"step": 235
},
{
"epoch": 1.78,
"learning_rate": 2.2471910112359549e-07,
"logps/chosen": -27.912431716918945,
"logps/rejected": -31.85492706298828,
"loss": 0.6448,
"losses/dpo": 0.5940742492675781,
"losses/sft": 0.969171404838562,
"losses/total": 0.5940742492675781,
"ref_logps/chosen": -24.59956932067871,
"ref_logps/rejected": -26.790037155151367,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.3312861919403076,
"rewards/margins": 0.17520278692245483,
"rewards/rejected": -0.5064890384674072,
"step": 236
},
{
"epoch": 1.79,
"learning_rate": 2.2331460674157302e-07,
"logps/chosen": -27.303508758544922,
"logps/rejected": -37.65882110595703,
"loss": 0.5545,
"losses/dpo": 0.5936781764030457,
"losses/sft": 1.015429139137268,
"losses/total": 0.5936781764030457,
"ref_logps/chosen": -24.510639190673828,
"ref_logps/rejected": -30.55707550048828,
"rewards/accuracies": 0.796875,
"rewards/chosen": -0.27928683161735535,
"rewards/margins": 0.4308881163597107,
"rewards/rejected": -0.7101750373840332,
"step": 237
},
{
"epoch": 1.8,
"learning_rate": 2.2191011235955055e-07,
"logps/chosen": -24.99541473388672,
"logps/rejected": -30.256423950195312,
"loss": 0.6034,
"losses/dpo": 0.608791172504425,
"losses/sft": 0.9114975929260254,
"losses/total": 0.608791172504425,
"ref_logps/chosen": -22.079914093017578,
"ref_logps/rejected": -24.779722213745117,
"rewards/accuracies": 0.6953125,
"rewards/chosen": -0.2915502190589905,
"rewards/margins": 0.2561199963092804,
"rewards/rejected": -0.5476702451705933,
"step": 238
},
{
"epoch": 1.8,
"learning_rate": 2.205056179775281e-07,
"logps/chosen": -27.542556762695312,
"logps/rejected": -34.17859649658203,
"loss": 0.574,
"losses/dpo": 0.5037014484405518,
"losses/sft": 0.8922078609466553,
"losses/total": 0.5037014484405518,
"ref_logps/chosen": -24.774127960205078,
"ref_logps/rejected": -27.759735107421875,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.2768429219722748,
"rewards/margins": 0.3650434911251068,
"rewards/rejected": -0.6418864727020264,
"step": 239
},
{
"epoch": 1.81,
"learning_rate": 2.191011235955056e-07,
"logps/chosen": -25.87149429321289,
"logps/rejected": -34.46807861328125,
"loss": 0.6117,
"losses/dpo": 0.7050824165344238,
"losses/sft": 0.9497538208961487,
"losses/total": 0.7050824165344238,
"ref_logps/chosen": -23.14657211303711,
"ref_logps/rejected": -28.885162353515625,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.27249252796173096,
"rewards/margins": 0.2857990562915802,
"rewards/rejected": -0.5582915544509888,
"step": 240
},
{
"epoch": 1.82,
"learning_rate": 2.1769662921348314e-07,
"logps/chosen": -24.89635467529297,
"logps/rejected": -33.622718811035156,
"loss": 0.5808,
"losses/dpo": 0.5883455276489258,
"losses/sft": 0.9948925375938416,
"losses/total": 0.5883455276489258,
"ref_logps/chosen": -21.52194595336914,
"ref_logps/rejected": -26.93505859375,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -0.3374406695365906,
"rewards/margins": 0.3313255310058594,
"rewards/rejected": -0.66876620054245,
"step": 241
},
{
"epoch": 1.83,
"learning_rate": 2.1629213483146067e-07,
"logps/chosen": -24.833309173583984,
"logps/rejected": -30.974327087402344,
"loss": 0.6136,
"losses/dpo": 0.6376237869262695,
"losses/sft": 0.9374114274978638,
"losses/total": 0.6376237869262695,
"ref_logps/chosen": -21.7708683013916,
"ref_logps/rejected": -25.24457359313965,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.30624428391456604,
"rewards/margins": 0.2667309045791626,
"rewards/rejected": -0.5729751586914062,
"step": 242
},
{
"epoch": 1.83,
"learning_rate": 2.148876404494382e-07,
"logps/chosen": -24.04471778869629,
"logps/rejected": -34.610633850097656,
"loss": 0.6133,
"losses/dpo": 0.645912766456604,
"losses/sft": 0.9913955926895142,
"losses/total": 0.645912766456604,
"ref_logps/chosen": -20.834651947021484,
"ref_logps/rejected": -28.77642059326172,
"rewards/accuracies": 0.6953125,
"rewards/chosen": -0.32100653648376465,
"rewards/margins": 0.26241475343704224,
"rewards/rejected": -0.5834212899208069,
"step": 243
},
{
"epoch": 1.84,
"learning_rate": 2.134831460674157e-07,
"logps/chosen": -26.419416427612305,
"logps/rejected": -34.56787109375,
"loss": 0.5713,
"losses/dpo": 0.6227866411209106,
"losses/sft": 0.9809292554855347,
"losses/total": 0.6227866411209106,
"ref_logps/chosen": -23.278644561767578,
"ref_logps/rejected": -27.596946716308594,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -0.3140770494937897,
"rewards/margins": 0.38301563262939453,
"rewards/rejected": -0.6970926523208618,
"step": 244
},
{
"epoch": 1.85,
"learning_rate": 2.1207865168539323e-07,
"logps/chosen": -26.64739990234375,
"logps/rejected": -33.21559524536133,
"loss": 0.59,
"losses/dpo": 0.6351089477539062,
"losses/sft": 0.9912072420120239,
"losses/total": 0.6351089477539062,
"ref_logps/chosen": -23.281349182128906,
"ref_logps/rejected": -26.577198028564453,
"rewards/accuracies": 0.7109375,
"rewards/chosen": -0.3366050124168396,
"rewards/margins": 0.32723480463027954,
"rewards/rejected": -0.6638398170471191,
"step": 245
},
{
"epoch": 1.86,
"learning_rate": 2.1067415730337076e-07,
"logps/chosen": -27.422582626342773,
"logps/rejected": -35.08824920654297,
"loss": 0.6064,
"losses/dpo": 0.5233859419822693,
"losses/sft": 0.8136109709739685,
"losses/total": 0.5233859419822693,
"ref_logps/chosen": -24.274629592895508,
"ref_logps/rejected": -28.790220260620117,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.3147951364517212,
"rewards/margins": 0.3150079846382141,
"rewards/rejected": -0.6298030614852905,
"step": 246
},
{
"epoch": 1.86,
"learning_rate": 2.0926966292134832e-07,
"logps/chosen": -26.381507873535156,
"logps/rejected": -31.576181411743164,
"loss": 0.5829,
"losses/dpo": 0.5970532894134521,
"losses/sft": 0.8552703261375427,
"losses/total": 0.5970532894134521,
"ref_logps/chosen": -23.155136108398438,
"ref_logps/rejected": -24.94633674621582,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.32263678312301636,
"rewards/margins": 0.34034764766693115,
"rewards/rejected": -0.6629844903945923,
"step": 247
},
{
"epoch": 1.87,
"learning_rate": 2.0786516853932585e-07,
"logps/chosen": -24.061811447143555,
"logps/rejected": -29.508312225341797,
"loss": 0.6137,
"losses/dpo": 0.6248607039451599,
"losses/sft": 0.8072177767753601,
"losses/total": 0.6248607039451599,
"ref_logps/chosen": -20.497760772705078,
"ref_logps/rejected": -23.47817611694336,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.35640496015548706,
"rewards/margins": 0.24660846590995789,
"rewards/rejected": -0.6030134558677673,
"step": 248
},
{
"epoch": 1.88,
"learning_rate": 2.0646067415730336e-07,
"logps/chosen": -29.165149688720703,
"logps/rejected": -35.16246032714844,
"loss": 0.5826,
"losses/dpo": 0.5271694660186768,
"losses/sft": 1.0120395421981812,
"losses/total": 0.5271694660186768,
"ref_logps/chosen": -25.856834411621094,
"ref_logps/rejected": -28.48740005493164,
"rewards/accuracies": 0.7421875,
"rewards/chosen": -0.3308315873146057,
"rewards/margins": 0.33667463064193726,
"rewards/rejected": -0.667506217956543,
"step": 249
},
{
"epoch": 1.89,
"learning_rate": 2.0505617977528089e-07,
"logps/chosen": -26.1055965423584,
"logps/rejected": -36.45195770263672,
"loss": 0.5345,
"losses/dpo": 0.5425952076911926,
"losses/sft": 0.9156839847564697,
"losses/total": 0.5425952076911926,
"ref_logps/chosen": -23.200654983520508,
"ref_logps/rejected": -28.661373138427734,
"rewards/accuracies": 0.7578125,
"rewards/chosen": -0.2904941737651825,
"rewards/margins": 0.48856407403945923,
"rewards/rejected": -0.7790582776069641,
"step": 250
},
{
"epoch": 1.89,
"learning_rate": 2.0365168539325842e-07,
"logps/chosen": -24.59746551513672,
"logps/rejected": -36.00947570800781,
"loss": 0.5622,
"losses/dpo": 0.6595858335494995,
"losses/sft": 0.8320033550262451,
"losses/total": 0.6595858335494995,
"ref_logps/chosen": -21.081745147705078,
"ref_logps/rejected": -28.422481536865234,
"rewards/accuracies": 0.703125,
"rewards/chosen": -0.3515721559524536,
"rewards/margins": 0.4071270823478699,
"rewards/rejected": -0.7586992383003235,
"step": 251
},
{
"epoch": 1.9,
"learning_rate": 2.0224719101123595e-07,
"logps/chosen": -25.407838821411133,
"logps/rejected": -33.07604217529297,
"loss": 0.5892,
"losses/dpo": 0.5324288606643677,
"losses/sft": 1.0311552286148071,
"losses/total": 0.5324288606643677,
"ref_logps/chosen": -22.188087463378906,
"ref_logps/rejected": -26.633270263671875,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.3219751715660095,
"rewards/margins": 0.322301983833313,
"rewards/rejected": -0.6442771553993225,
"step": 252
},
{
"epoch": 1.91,
"learning_rate": 2.0084269662921348e-07,
"logps/chosen": -26.190311431884766,
"logps/rejected": -33.34137725830078,
"loss": 0.5861,
"losses/dpo": 0.6612842082977295,
"losses/sft": 0.8551939129829407,
"losses/total": 0.6612842082977295,
"ref_logps/chosen": -22.73942756652832,
"ref_logps/rejected": -26.702760696411133,
"rewards/accuracies": 0.671875,
"rewards/chosen": -0.345088392496109,
"rewards/margins": 0.3187733292579651,
"rewards/rejected": -0.6638616919517517,
"step": 253
},
{
"epoch": 1.92,
"learning_rate": 1.9943820224719098e-07,
"logps/chosen": -27.615928649902344,
"logps/rejected": -33.776695251464844,
"loss": 0.5511,
"losses/dpo": 0.6082693338394165,
"losses/sft": 1.0973209142684937,
"losses/total": 0.6082693338394165,
"ref_logps/chosen": -24.300251007080078,
"ref_logps/rejected": -26.21588897705078,
"rewards/accuracies": 0.7578125,
"rewards/chosen": -0.3315678834915161,
"rewards/margins": 0.42451295256614685,
"rewards/rejected": -0.7560808658599854,
"step": 254
},
{
"epoch": 1.92,
"learning_rate": 1.9803370786516854e-07,
"logps/chosen": -28.257335662841797,
"logps/rejected": -35.67947769165039,
"loss": 0.5919,
"losses/dpo": 0.6389520168304443,
"losses/sft": 1.087360143661499,
"losses/total": 0.6389520168304443,
"ref_logps/chosen": -24.031015396118164,
"ref_logps/rejected": -28.11650276184082,
"rewards/accuracies": 0.6796875,
"rewards/chosen": -0.4226321578025818,
"rewards/margins": 0.33366525173187256,
"rewards/rejected": -0.7562973499298096,
"step": 255
},
{
"epoch": 1.93,
"learning_rate": 1.9662921348314607e-07,
"logps/chosen": -27.326435089111328,
"logps/rejected": -33.91853713989258,
"loss": 0.5884,
"losses/dpo": 0.5772832632064819,
"losses/sft": 1.0057258605957031,
"losses/total": 0.5772832632064819,
"ref_logps/chosen": -23.13665008544922,
"ref_logps/rejected": -26.448516845703125,
"rewards/accuracies": 0.6796875,
"rewards/chosen": -0.4189784526824951,
"rewards/margins": 0.32802364230155945,
"rewards/rejected": -0.747002124786377,
"step": 256
},
{
"epoch": 1.94,
"learning_rate": 1.952247191011236e-07,
"logps/chosen": -24.134462356567383,
"logps/rejected": -34.923095703125,
"loss": 0.5245,
"losses/dpo": 0.5826983451843262,
"losses/sft": 0.7670709490776062,
"losses/total": 0.5826983451843262,
"ref_logps/chosen": -21.35719108581543,
"ref_logps/rejected": -27.226768493652344,
"rewards/accuracies": 0.7578125,
"rewards/chosen": -0.2777270972728729,
"rewards/margins": 0.49190521240234375,
"rewards/rejected": -0.7696323394775391,
"step": 257
},
{
"epoch": 1.95,
"learning_rate": 1.938202247191011e-07,
"logps/chosen": -24.894744873046875,
"logps/rejected": -36.34782791137695,
"loss": 0.5654,
"losses/dpo": 0.5832593441009521,
"losses/sft": 0.8260340094566345,
"losses/total": 0.5832593441009521,
"ref_logps/chosen": -21.5096435546875,
"ref_logps/rejected": -28.905296325683594,
"rewards/accuracies": 0.7734375,
"rewards/chosen": -0.3385101854801178,
"rewards/margins": 0.4057431221008301,
"rewards/rejected": -0.7442533373832703,
"step": 258
},
{
"epoch": 1.95,
"learning_rate": 1.9241573033707863e-07,
"logps/chosen": -29.12051773071289,
"logps/rejected": -33.72222900390625,
"loss": 0.6189,
"losses/dpo": 0.5586456060409546,
"losses/sft": 1.1363164186477661,
"losses/total": 0.5586456060409546,
"ref_logps/chosen": -24.818958282470703,
"ref_logps/rejected": -26.637435913085938,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.430155873298645,
"rewards/margins": 0.27832359075546265,
"rewards/rejected": -0.7084795236587524,
"step": 259
},
{
"epoch": 1.96,
"learning_rate": 1.9101123595505617e-07,
"logps/chosen": -25.77654266357422,
"logps/rejected": -32.80144119262695,
"loss": 0.5647,
"losses/dpo": 0.6132915616035461,
"losses/sft": 0.8355939984321594,
"losses/total": 0.6132915616035461,
"ref_logps/chosen": -22.049232482910156,
"ref_logps/rejected": -25.218961715698242,
"rewards/accuracies": 0.765625,
"rewards/chosen": -0.37273097038269043,
"rewards/margins": 0.3855169415473938,
"rewards/rejected": -0.758247971534729,
"step": 260
},
{
"epoch": 1.97,
"learning_rate": 1.896067415730337e-07,
"logps/chosen": -27.173105239868164,
"logps/rejected": -33.18789291381836,
"loss": 0.5757,
"losses/dpo": 0.6402326822280884,
"losses/sft": 0.9358000159263611,
"losses/total": 0.6402326822280884,
"ref_logps/chosen": -24.05023956298828,
"ref_logps/rejected": -26.557050704956055,
"rewards/accuracies": 0.6796875,
"rewards/chosen": -0.3122865557670593,
"rewards/margins": 0.3507978618144989,
"rewards/rejected": -0.6630844473838806,
"step": 261
},
{
"epoch": 1.98,
"learning_rate": 1.8820224719101123e-07,
"logps/chosen": -25.127092361450195,
"logps/rejected": -34.0608024597168,
"loss": 0.5844,
"losses/dpo": 0.576771080493927,
"losses/sft": 0.8823024034500122,
"losses/total": 0.576771080493927,
"ref_logps/chosen": -21.552627563476562,
"ref_logps/rejected": -27.01715087890625,
"rewards/accuracies": 0.671875,
"rewards/chosen": -0.35744667053222656,
"rewards/margins": 0.346918523311615,
"rewards/rejected": -0.7043651938438416,
"step": 262
},
{
"epoch": 1.98,
"learning_rate": 1.8679775280898876e-07,
"logps/chosen": -25.840179443359375,
"logps/rejected": -34.11262893676758,
"loss": 0.5675,
"losses/dpo": 0.5643225312232971,
"losses/sft": 0.7924672365188599,
"losses/total": 0.5643225312232971,
"ref_logps/chosen": -22.366439819335938,
"ref_logps/rejected": -26.873088836669922,
"rewards/accuracies": 0.7109375,
"rewards/chosen": -0.3473738133907318,
"rewards/margins": 0.37658050656318665,
"rewards/rejected": -0.7239543199539185,
"step": 263
},
{
"epoch": 1.99,
"learning_rate": 1.853932584269663e-07,
"logps/chosen": -24.64289093017578,
"logps/rejected": -30.944011688232422,
"loss": 0.5768,
"losses/dpo": 0.6149911880493164,
"losses/sft": 0.9512190222740173,
"losses/total": 0.6149911880493164,
"ref_logps/chosen": -21.270837783813477,
"ref_logps/rejected": -24.173620223999023,
"rewards/accuracies": 0.734375,
"rewards/chosen": -0.33720535039901733,
"rewards/margins": 0.3398338854312897,
"rewards/rejected": -0.6770392656326294,
"step": 264
},
{
"epoch": 2.0,
"learning_rate": 1.8398876404494382e-07,
"logps/chosen": -26.05956268310547,
"logps/rejected": -35.905609130859375,
"loss": 0.5407,
"losses/dpo": 0.49823397397994995,
"losses/sft": 0.8145182132720947,
"losses/total": 0.49823397397994995,
"ref_logps/chosen": -23.054027557373047,
"ref_logps/rejected": -28.27884864807129,
"rewards/accuracies": 0.796875,
"rewards/chosen": -0.3005535304546356,
"rewards/margins": 0.4621226191520691,
"rewards/rejected": -0.7626761794090271,
"step": 265
},
{
"epoch": 2.01,
"learning_rate": 1.8258426966292135e-07,
"logps/chosen": -24.907108306884766,
"logps/rejected": -33.71357345581055,
"loss": 0.5301,
"losses/dpo": 0.49924543499946594,
"losses/sft": 0.9444026350975037,
"losses/total": 0.49924543499946594,
"ref_logps/chosen": -21.562763214111328,
"ref_logps/rejected": -25.69823455810547,
"rewards/accuracies": 0.7578125,
"rewards/chosen": -0.3344343304634094,
"rewards/margins": 0.4670996069908142,
"rewards/rejected": -0.8015338778495789,
"step": 266
},
{
"epoch": 2.02,
"learning_rate": 1.8117977528089888e-07,
"logps/chosen": -24.61281967163086,
"logps/rejected": -33.178436279296875,
"loss": 0.5843,
"losses/dpo": 0.6827691793441772,
"losses/sft": 0.9820384979248047,
"losses/total": 0.6827691793441772,
"ref_logps/chosen": -20.769065856933594,
"ref_logps/rejected": -25.740190505981445,
"rewards/accuracies": 0.6953125,
"rewards/chosen": -0.3843753933906555,
"rewards/margins": 0.35944926738739014,
"rewards/rejected": -0.7438246607780457,
"step": 267
},
{
"epoch": 2.02,
"learning_rate": 1.7977528089887638e-07,
"logps/chosen": -25.742042541503906,
"logps/rejected": -31.92254638671875,
"loss": 0.5765,
"losses/dpo": 0.48391562700271606,
"losses/sft": 0.9694733619689941,
"losses/total": 0.48391562700271606,
"ref_logps/chosen": -22.27023696899414,
"ref_logps/rejected": -24.869842529296875,
"rewards/accuracies": 0.734375,
"rewards/chosen": -0.34718072414398193,
"rewards/margins": 0.3580899238586426,
"rewards/rejected": -0.7052706480026245,
"step": 268
},
{
"epoch": 2.03,
"learning_rate": 1.7837078651685391e-07,
"logps/chosen": -24.76668930053711,
"logps/rejected": -33.92596435546875,
"loss": 0.5197,
"losses/dpo": 0.566383957862854,
"losses/sft": 1.056198239326477,
"losses/total": 0.566383957862854,
"ref_logps/chosen": -21.79462432861328,
"ref_logps/rejected": -26.1394100189209,
"rewards/accuracies": 0.796875,
"rewards/chosen": -0.2972065806388855,
"rewards/margins": 0.48144853115081787,
"rewards/rejected": -0.7786551713943481,
"step": 269
},
{
"epoch": 2.04,
"learning_rate": 1.7696629213483144e-07,
"logps/chosen": -25.022621154785156,
"logps/rejected": -32.83625030517578,
"loss": 0.554,
"losses/dpo": 0.5455434322357178,
"losses/sft": 0.9091237783432007,
"losses/total": 0.5455434322357178,
"ref_logps/chosen": -21.205692291259766,
"ref_logps/rejected": -24.853519439697266,
"rewards/accuracies": 0.765625,
"rewards/chosen": -0.3816927969455719,
"rewards/margins": 0.4165803790092468,
"rewards/rejected": -0.7982731461524963,
"step": 270
},
{
"epoch": 2.05,
"learning_rate": 1.75561797752809e-07,
"logps/chosen": -27.038639068603516,
"logps/rejected": -35.007415771484375,
"loss": 0.5526,
"losses/dpo": 0.7876778841018677,
"losses/sft": 1.1023296117782593,
"losses/total": 0.7876778841018677,
"ref_logps/chosen": -23.039878845214844,
"ref_logps/rejected": -26.884708404541016,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.39987578988075256,
"rewards/margins": 0.41239458322525024,
"rewards/rejected": -0.8122704029083252,
"step": 271
},
{
"epoch": 2.05,
"learning_rate": 1.741573033707865e-07,
"logps/chosen": -26.300579071044922,
"logps/rejected": -36.53676223754883,
"loss": 0.5444,
"losses/dpo": 0.4805631637573242,
"losses/sft": 0.8787716031074524,
"losses/total": 0.4805631637573242,
"ref_logps/chosen": -22.55372428894043,
"ref_logps/rejected": -27.711929321289062,
"rewards/accuracies": 0.7578125,
"rewards/chosen": -0.3746855556964874,
"rewards/margins": 0.5077978372573853,
"rewards/rejected": -0.8824833631515503,
"step": 272
},
{
"epoch": 2.06,
"learning_rate": 1.7275280898876404e-07,
"logps/chosen": -27.612911224365234,
"logps/rejected": -39.48854064941406,
"loss": 0.4883,
"losses/dpo": 0.5499591827392578,
"losses/sft": 1.1995720863342285,
"losses/total": 0.5499591827392578,
"ref_logps/chosen": -23.866586685180664,
"ref_logps/rejected": -29.748516082763672,
"rewards/accuracies": 0.828125,
"rewards/chosen": -0.37463241815567017,
"rewards/margins": 0.5993699431419373,
"rewards/rejected": -0.9740023612976074,
"step": 273
},
{
"epoch": 2.07,
"learning_rate": 1.7134831460674157e-07,
"logps/chosen": -28.848485946655273,
"logps/rejected": -38.46211242675781,
"loss": 0.5223,
"losses/dpo": 0.5853086113929749,
"losses/sft": 0.9450937509536743,
"losses/total": 0.5853086113929749,
"ref_logps/chosen": -24.71368980407715,
"ref_logps/rejected": -29.095096588134766,
"rewards/accuracies": 0.796875,
"rewards/chosen": -0.4134795069694519,
"rewards/margins": 0.5232220888137817,
"rewards/rejected": -0.9367015957832336,
"step": 274
},
{
"epoch": 2.08,
"learning_rate": 1.699438202247191e-07,
"logps/chosen": -26.53584861755371,
"logps/rejected": -33.2642707824707,
"loss": 0.5583,
"losses/dpo": 0.6550332307815552,
"losses/sft": 0.844421923160553,
"losses/total": 0.6550332307815552,
"ref_logps/chosen": -22.528621673583984,
"ref_logps/rejected": -25.07666778564453,
"rewards/accuracies": 0.765625,
"rewards/chosen": -0.4007226824760437,
"rewards/margins": 0.4180375933647156,
"rewards/rejected": -0.8187602758407593,
"step": 275
},
{
"epoch": 2.08,
"learning_rate": 1.6853932584269663e-07,
"logps/chosen": -26.93305778503418,
"logps/rejected": -36.43919372558594,
"loss": 0.5267,
"losses/dpo": 0.37509262561798096,
"losses/sft": 0.9286944270133972,
"losses/total": 0.37509262561798096,
"ref_logps/chosen": -22.965662002563477,
"ref_logps/rejected": -27.516300201416016,
"rewards/accuracies": 0.765625,
"rewards/chosen": -0.39673954248428345,
"rewards/margins": 0.4955495595932007,
"rewards/rejected": -0.8922891616821289,
"step": 276
},
{
"epoch": 2.09,
"learning_rate": 1.6713483146067413e-07,
"logps/chosen": -27.517230987548828,
"logps/rejected": -33.23160934448242,
"loss": 0.585,
"losses/dpo": 0.45891374349594116,
"losses/sft": 0.8818660378456116,
"losses/total": 0.45891374349594116,
"ref_logps/chosen": -23.37508773803711,
"ref_logps/rejected": -25.319503784179688,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.4142143726348877,
"rewards/margins": 0.37699633836746216,
"rewards/rejected": -0.7912107110023499,
"step": 277
},
{
"epoch": 2.1,
"learning_rate": 1.6573033707865166e-07,
"logps/chosen": -29.848949432373047,
"logps/rejected": -37.009605407714844,
"loss": 0.5569,
"losses/dpo": 0.6695871353149414,
"losses/sft": 1.1478632688522339,
"losses/total": 0.6695871353149414,
"ref_logps/chosen": -25.79513168334961,
"ref_logps/rejected": -28.52492332458496,
"rewards/accuracies": 0.7421875,
"rewards/chosen": -0.4053817391395569,
"rewards/margins": 0.4430864751338959,
"rewards/rejected": -0.8484681844711304,
"step": 278
},
{
"epoch": 2.11,
"learning_rate": 1.6432584269662922e-07,
"logps/chosen": -26.847461700439453,
"logps/rejected": -33.84664535522461,
"loss": 0.5853,
"losses/dpo": 0.6266674995422363,
"losses/sft": 0.9419240951538086,
"losses/total": 0.6266674995422363,
"ref_logps/chosen": -23.1273193359375,
"ref_logps/rejected": -26.497238159179688,
"rewards/accuracies": 0.671875,
"rewards/chosen": -0.3720143437385559,
"rewards/margins": 0.3629264533519745,
"rewards/rejected": -0.734940767288208,
"step": 279
},
{
"epoch": 2.11,
"learning_rate": 1.6292134831460675e-07,
"logps/chosen": -25.227951049804688,
"logps/rejected": -37.79768371582031,
"loss": 0.5277,
"losses/dpo": 0.5965819358825684,
"losses/sft": 1.0364360809326172,
"losses/total": 0.5965819358825684,
"ref_logps/chosen": -20.85896873474121,
"ref_logps/rejected": -28.49602508544922,
"rewards/accuracies": 0.7890625,
"rewards/chosen": -0.43689805269241333,
"rewards/margins": 0.49326756596565247,
"rewards/rejected": -0.9301656484603882,
"step": 280
},
{
"epoch": 2.12,
"learning_rate": 1.6151685393258428e-07,
"logps/chosen": -25.945070266723633,
"logps/rejected": -35.28973388671875,
"loss": 0.5305,
"losses/dpo": 0.5456879138946533,
"losses/sft": 0.8692267537117004,
"losses/total": 0.5456879138946533,
"ref_logps/chosen": -22.303909301757812,
"ref_logps/rejected": -26.77994155883789,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.36411628127098083,
"rewards/margins": 0.4868628680706024,
"rewards/rejected": -0.850979208946228,
"step": 281
},
{
"epoch": 2.13,
"learning_rate": 1.6011235955056178e-07,
"logps/chosen": -28.660266876220703,
"logps/rejected": -36.41142272949219,
"loss": 0.5766,
"losses/dpo": 0.6054384708404541,
"losses/sft": 0.9599564671516418,
"losses/total": 0.6054384708404541,
"ref_logps/chosen": -24.156015396118164,
"ref_logps/rejected": -28.24047088623047,
"rewards/accuracies": 0.734375,
"rewards/chosen": -0.4504254460334778,
"rewards/margins": 0.36666956543922424,
"rewards/rejected": -0.8170950412750244,
"step": 282
},
{
"epoch": 2.14,
"learning_rate": 1.5870786516853931e-07,
"logps/chosen": -27.74228858947754,
"logps/rejected": -38.50691604614258,
"loss": 0.5215,
"losses/dpo": 0.463223397731781,
"losses/sft": 1.041387915611267,
"losses/total": 0.463223397731781,
"ref_logps/chosen": -23.603931427001953,
"ref_logps/rejected": -28.803909301757812,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.4138358533382416,
"rewards/margins": 0.5564644932746887,
"rewards/rejected": -0.9703004360198975,
"step": 283
},
{
"epoch": 2.14,
"learning_rate": 1.5730337078651685e-07,
"logps/chosen": -24.93131446838379,
"logps/rejected": -35.91729736328125,
"loss": 0.5266,
"losses/dpo": 0.6279169321060181,
"losses/sft": 0.8709256052970886,
"losses/total": 0.6279169321060181,
"ref_logps/chosen": -20.768774032592773,
"ref_logps/rejected": -26.83118438720703,
"rewards/accuracies": 0.7734375,
"rewards/chosen": -0.4162542223930359,
"rewards/margins": 0.4923573136329651,
"rewards/rejected": -0.9086115956306458,
"step": 284
},
{
"epoch": 2.15,
"learning_rate": 1.5589887640449438e-07,
"logps/chosen": -27.571338653564453,
"logps/rejected": -38.57915496826172,
"loss": 0.5687,
"losses/dpo": 0.5966840386390686,
"losses/sft": 0.9412966966629028,
"losses/total": 0.5966840386390686,
"ref_logps/chosen": -22.682205200195312,
"ref_logps/rejected": -29.58980369567871,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -0.48891347646713257,
"rewards/margins": 0.4100216031074524,
"rewards/rejected": -0.898935079574585,
"step": 285
},
{
"epoch": 2.16,
"learning_rate": 1.5449438202247188e-07,
"logps/chosen": -25.46674346923828,
"logps/rejected": -33.395118713378906,
"loss": 0.5902,
"losses/dpo": 0.7199227213859558,
"losses/sft": 0.9989073276519775,
"losses/total": 0.7199227213859558,
"ref_logps/chosen": -21.129976272583008,
"ref_logps/rejected": -25.61261749267578,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.4336766302585602,
"rewards/margins": 0.3445735573768616,
"rewards/rejected": -0.7782501578330994,
"step": 286
},
{
"epoch": 2.17,
"learning_rate": 1.5308988764044944e-07,
"logps/chosen": -26.559568405151367,
"logps/rejected": -39.50514221191406,
"loss": 0.5101,
"losses/dpo": 0.42156773805618286,
"losses/sft": 0.824786365032196,
"losses/total": 0.42156773805618286,
"ref_logps/chosen": -21.88895606994629,
"ref_logps/rejected": -29.256093978881836,
"rewards/accuracies": 0.8046875,
"rewards/chosen": -0.46706122159957886,
"rewards/margins": 0.557843804359436,
"rewards/rejected": -1.0249050855636597,
"step": 287
},
{
"epoch": 2.17,
"learning_rate": 1.5168539325842697e-07,
"logps/chosen": -27.430574417114258,
"logps/rejected": -35.846214294433594,
"loss": 0.5852,
"losses/dpo": 0.7073966264724731,
"losses/sft": 0.959773600101471,
"losses/total": 0.7073966264724731,
"ref_logps/chosen": -22.33085060119629,
"ref_logps/rejected": -26.85816192626953,
"rewards/accuracies": 0.7109375,
"rewards/chosen": -0.509972333908081,
"rewards/margins": 0.38883259892463684,
"rewards/rejected": -0.8988049030303955,
"step": 288
},
{
"epoch": 2.18,
"learning_rate": 1.502808988764045e-07,
"logps/chosen": -25.799861907958984,
"logps/rejected": -37.50861358642578,
"loss": 0.5553,
"losses/dpo": 0.5419721603393555,
"losses/sft": 0.940202534198761,
"losses/total": 0.5419721603393555,
"ref_logps/chosen": -21.224933624267578,
"ref_logps/rejected": -28.002174377441406,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.4574929475784302,
"rewards/margins": 0.4931509494781494,
"rewards/rejected": -0.9506438970565796,
"step": 289
},
{
"epoch": 2.19,
"learning_rate": 1.4887640449438203e-07,
"logps/chosen": -27.79110336303711,
"logps/rejected": -34.21430206298828,
"loss": 0.5921,
"losses/dpo": 0.6595107913017273,
"losses/sft": 1.0057413578033447,
"losses/total": 0.6595107913017273,
"ref_logps/chosen": -23.073078155517578,
"ref_logps/rejected": -26.093578338623047,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.4718025326728821,
"rewards/margins": 0.34026968479156494,
"rewards/rejected": -0.8120721578598022,
"step": 290
},
{
"epoch": 2.2,
"learning_rate": 1.4747191011235953e-07,
"logps/chosen": -26.380355834960938,
"logps/rejected": -37.56932830810547,
"loss": 0.5263,
"losses/dpo": 0.47728973627090454,
"losses/sft": 1.0133030414581299,
"losses/total": 0.47728973627090454,
"ref_logps/chosen": -22.488906860351562,
"ref_logps/rejected": -28.34372329711914,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.38914480805397034,
"rewards/margins": 0.5334160327911377,
"rewards/rejected": -0.9225608110427856,
"step": 291
},
{
"epoch": 2.2,
"learning_rate": 1.4606741573033706e-07,
"logps/chosen": -27.006134033203125,
"logps/rejected": -37.092594146728516,
"loss": 0.5417,
"losses/dpo": 0.7257384061813354,
"losses/sft": 1.2120591402053833,
"losses/total": 0.7257384061813354,
"ref_logps/chosen": -22.130794525146484,
"ref_logps/rejected": -27.460662841796875,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.48753368854522705,
"rewards/margins": 0.47565943002700806,
"rewards/rejected": -0.9631930589675903,
"step": 292
},
{
"epoch": 2.21,
"learning_rate": 1.446629213483146e-07,
"logps/chosen": -25.57880210876465,
"logps/rejected": -37.16014099121094,
"loss": 0.5381,
"losses/dpo": 0.6313049793243408,
"losses/sft": 0.9201721549034119,
"losses/total": 0.6313049793243408,
"ref_logps/chosen": -21.904296875,
"ref_logps/rejected": -28.32394027709961,
"rewards/accuracies": 0.6953125,
"rewards/chosen": -0.3674505054950714,
"rewards/margins": 0.5161697268486023,
"rewards/rejected": -0.8836201429367065,
"step": 293
},
{
"epoch": 2.22,
"learning_rate": 1.4325842696629212e-07,
"logps/chosen": -24.601829528808594,
"logps/rejected": -37.283538818359375,
"loss": 0.5429,
"losses/dpo": 0.5216307044029236,
"losses/sft": 1.0138075351715088,
"losses/total": 0.5216307044029236,
"ref_logps/chosen": -20.36395263671875,
"ref_logps/rejected": -28.042282104492188,
"rewards/accuracies": 0.7734375,
"rewards/chosen": -0.4237874746322632,
"rewards/margins": 0.5003381967544556,
"rewards/rejected": -0.9241256713867188,
"step": 294
},
{
"epoch": 2.23,
"learning_rate": 1.4185393258426968e-07,
"logps/chosen": -28.58258819580078,
"logps/rejected": -36.47199249267578,
"loss": 0.5894,
"losses/dpo": 0.43838924169540405,
"losses/sft": 1.2099077701568604,
"losses/total": 0.43838924169540405,
"ref_logps/chosen": -23.226455688476562,
"ref_logps/rejected": -26.880578994750977,
"rewards/accuracies": 0.6953125,
"rewards/chosen": -0.5356131792068481,
"rewards/margins": 0.423528254032135,
"rewards/rejected": -0.9591414332389832,
"step": 295
},
{
"epoch": 2.23,
"learning_rate": 1.4044943820224718e-07,
"logps/chosen": -28.39300537109375,
"logps/rejected": -36.5651741027832,
"loss": 0.5472,
"losses/dpo": 0.6286274790763855,
"losses/sft": 1.1655751466751099,
"losses/total": 0.6286274790763855,
"ref_logps/chosen": -23.700767517089844,
"ref_logps/rejected": -26.989849090576172,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -0.4692240357398987,
"rewards/margins": 0.48830845952033997,
"rewards/rejected": -0.9575324058532715,
"step": 296
},
{
"epoch": 2.24,
"learning_rate": 1.3904494382022472e-07,
"logps/chosen": -27.01761245727539,
"logps/rejected": -34.147830963134766,
"loss": 0.5581,
"losses/dpo": 0.425361692905426,
"losses/sft": 1.129596471786499,
"losses/total": 0.425361692905426,
"ref_logps/chosen": -22.251991271972656,
"ref_logps/rejected": -24.953710556030273,
"rewards/accuracies": 0.671875,
"rewards/chosen": -0.47656214237213135,
"rewards/margins": 0.4428498148918152,
"rewards/rejected": -0.9194119572639465,
"step": 297
},
{
"epoch": 2.25,
"learning_rate": 1.3764044943820225e-07,
"logps/chosen": -27.92938995361328,
"logps/rejected": -39.81676483154297,
"loss": 0.5111,
"losses/dpo": 0.5578055381774902,
"losses/sft": 1.1197444200515747,
"losses/total": 0.5578055381774902,
"ref_logps/chosen": -23.314014434814453,
"ref_logps/rejected": -29.507539749145508,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.4615376591682434,
"rewards/margins": 0.5693849325180054,
"rewards/rejected": -1.0309226512908936,
"step": 298
},
{
"epoch": 2.26,
"learning_rate": 1.3623595505617978e-07,
"logps/chosen": -27.910144805908203,
"logps/rejected": -35.69133758544922,
"loss": 0.5499,
"losses/dpo": 0.4847102165222168,
"losses/sft": 0.989621639251709,
"losses/total": 0.4847102165222168,
"ref_logps/chosen": -23.326908111572266,
"ref_logps/rejected": -26.762676239013672,
"rewards/accuracies": 0.734375,
"rewards/chosen": -0.45832377672195435,
"rewards/margins": 0.43454277515411377,
"rewards/rejected": -0.8928664922714233,
"step": 299
},
{
"epoch": 2.26,
"learning_rate": 1.3483146067415728e-07,
"logps/chosen": -28.233020782470703,
"logps/rejected": -37.3542366027832,
"loss": 0.5935,
"losses/dpo": 0.5905570983886719,
"losses/sft": 1.0464057922363281,
"losses/total": 0.5905570983886719,
"ref_logps/chosen": -23.20905303955078,
"ref_logps/rejected": -28.49638557434082,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.5023964643478394,
"rewards/margins": 0.38338857889175415,
"rewards/rejected": -0.8857850432395935,
"step": 300
},
{
"epoch": 2.27,
"learning_rate": 1.334269662921348e-07,
"logps/chosen": -29.44438934326172,
"logps/rejected": -36.25569152832031,
"loss": 0.5608,
"losses/dpo": 0.5518324375152588,
"losses/sft": 0.9761526584625244,
"losses/total": 0.5518324375152588,
"ref_logps/chosen": -24.553022384643555,
"ref_logps/rejected": -26.846464157104492,
"rewards/accuracies": 0.7109375,
"rewards/chosen": -0.4891367554664612,
"rewards/margins": 0.451786071062088,
"rewards/rejected": -0.9409228563308716,
"step": 301
},
{
"epoch": 2.28,
"learning_rate": 1.3202247191011234e-07,
"logps/chosen": -29.32979393005371,
"logps/rejected": -37.83529281616211,
"loss": 0.5463,
"losses/dpo": 0.5125599503517151,
"losses/sft": 0.9747940897941589,
"losses/total": 0.5125599503517151,
"ref_logps/chosen": -24.211776733398438,
"ref_logps/rejected": -27.92011260986328,
"rewards/accuracies": 0.734375,
"rewards/chosen": -0.5118017196655273,
"rewards/margins": 0.47971609234809875,
"rewards/rejected": -0.9915178418159485,
"step": 302
},
{
"epoch": 2.29,
"learning_rate": 1.306179775280899e-07,
"logps/chosen": -27.11697769165039,
"logps/rejected": -35.418338775634766,
"loss": 0.575,
"losses/dpo": 0.5703020095825195,
"losses/sft": 0.9395530223846436,
"losses/total": 0.5703020095825195,
"ref_logps/chosen": -22.12503433227539,
"ref_logps/rejected": -26.28829574584961,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -0.49919426441192627,
"rewards/margins": 0.4138101041316986,
"rewards/rejected": -0.9130042791366577,
"step": 303
},
{
"epoch": 2.29,
"learning_rate": 1.2921348314606743e-07,
"logps/chosen": -31.24747085571289,
"logps/rejected": -38.8961181640625,
"loss": 0.5843,
"losses/dpo": 0.4914831221103668,
"losses/sft": 0.9517439603805542,
"losses/total": 0.4914831221103668,
"ref_logps/chosen": -25.351207733154297,
"ref_logps/rejected": -29.048057556152344,
"rewards/accuracies": 0.734375,
"rewards/chosen": -0.5896263122558594,
"rewards/margins": 0.3951793909072876,
"rewards/rejected": -0.9848057627677917,
"step": 304
},
{
"epoch": 2.3,
"learning_rate": 1.2780898876404493e-07,
"logps/chosen": -27.757152557373047,
"logps/rejected": -37.481964111328125,
"loss": 0.5261,
"losses/dpo": 0.4620394706726074,
"losses/sft": 1.0134756565093994,
"losses/total": 0.4620394706726074,
"ref_logps/chosen": -23.13431167602539,
"ref_logps/rejected": -27.492977142333984,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.4622839689254761,
"rewards/margins": 0.5366144776344299,
"rewards/rejected": -0.9988985061645508,
"step": 305
},
{
"epoch": 2.31,
"learning_rate": 1.2640449438202246e-07,
"logps/chosen": -26.990705490112305,
"logps/rejected": -34.95043182373047,
"loss": 0.5636,
"losses/dpo": 0.5714951753616333,
"losses/sft": 0.9859296679496765,
"losses/total": 0.5714951753616333,
"ref_logps/chosen": -22.465744018554688,
"ref_logps/rejected": -26.120864868164062,
"rewards/accuracies": 0.7421875,
"rewards/chosen": -0.452495813369751,
"rewards/margins": 0.4304611086845398,
"rewards/rejected": -0.882956862449646,
"step": 306
},
{
"epoch": 2.32,
"learning_rate": 1.25e-07,
"logps/chosen": -29.839576721191406,
"logps/rejected": -40.363712310791016,
"loss": 0.5508,
"losses/dpo": 0.5849748253822327,
"losses/sft": 0.9925932288169861,
"losses/total": 0.5849748253822327,
"ref_logps/chosen": -24.343534469604492,
"ref_logps/rejected": -29.49897003173828,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.5496042370796204,
"rewards/margins": 0.5368699431419373,
"rewards/rejected": -1.0864741802215576,
"step": 307
},
{
"epoch": 2.32,
"learning_rate": 1.2359550561797752e-07,
"logps/chosen": -26.925609588623047,
"logps/rejected": -35.485931396484375,
"loss": 0.5537,
"losses/dpo": 0.43898260593414307,
"losses/sft": 0.8520787954330444,
"losses/total": 0.43898260593414307,
"ref_logps/chosen": -22.369102478027344,
"ref_logps/rejected": -26.426111221313477,
"rewards/accuracies": 0.7734375,
"rewards/chosen": -0.4556504487991333,
"rewards/margins": 0.450331449508667,
"rewards/rejected": -0.9059818387031555,
"step": 308
},
{
"epoch": 2.33,
"learning_rate": 1.2219101123595506e-07,
"logps/chosen": -27.308425903320312,
"logps/rejected": -34.86455535888672,
"loss": 0.6099,
"losses/dpo": 0.6877168416976929,
"losses/sft": 0.8925371766090393,
"losses/total": 0.6877168416976929,
"ref_logps/chosen": -22.015674591064453,
"ref_logps/rejected": -26.024978637695312,
"rewards/accuracies": 0.6796875,
"rewards/chosen": -0.5292750597000122,
"rewards/margins": 0.3546826243400574,
"rewards/rejected": -0.8839576840400696,
"step": 309
},
{
"epoch": 2.34,
"learning_rate": 1.2078651685393259e-07,
"logps/chosen": -27.23873519897461,
"logps/rejected": -34.355613708496094,
"loss": 0.5451,
"losses/dpo": 0.4608323574066162,
"losses/sft": 1.068372130393982,
"losses/total": 0.4608323574066162,
"ref_logps/chosen": -22.93021011352539,
"ref_logps/rejected": -25.29645538330078,
"rewards/accuracies": 0.7421875,
"rewards/chosen": -0.4308522939682007,
"rewards/margins": 0.4750638008117676,
"rewards/rejected": -0.9059160947799683,
"step": 310
},
{
"epoch": 2.35,
"learning_rate": 1.1938202247191012e-07,
"logps/chosen": -27.410297393798828,
"logps/rejected": -36.60034942626953,
"loss": 0.5435,
"losses/dpo": 0.49991002678871155,
"losses/sft": 0.9416501522064209,
"losses/total": 0.49991002678871155,
"ref_logps/chosen": -22.95693016052246,
"ref_logps/rejected": -27.357826232910156,
"rewards/accuracies": 0.765625,
"rewards/chosen": -0.4453369379043579,
"rewards/margins": 0.47891533374786377,
"rewards/rejected": -0.9242523312568665,
"step": 311
},
{
"epoch": 2.35,
"learning_rate": 1.1797752808988763e-07,
"logps/chosen": -26.075031280517578,
"logps/rejected": -35.989906311035156,
"loss": 0.5108,
"losses/dpo": 0.49980589747428894,
"losses/sft": 0.8830540776252747,
"losses/total": 0.49980589747428894,
"ref_logps/chosen": -21.97342300415039,
"ref_logps/rejected": -26.20843505859375,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.4101608395576477,
"rewards/margins": 0.5679866075515747,
"rewards/rejected": -0.9781473875045776,
"step": 312
},
{
"epoch": 2.36,
"learning_rate": 1.1657303370786515e-07,
"logps/chosen": -27.034866333007812,
"logps/rejected": -36.756141662597656,
"loss": 0.535,
"losses/dpo": 0.5506036281585693,
"losses/sft": 0.842628002166748,
"losses/total": 0.5506036281585693,
"ref_logps/chosen": -22.406818389892578,
"ref_logps/rejected": -26.883403778076172,
"rewards/accuracies": 0.7734375,
"rewards/chosen": -0.4628047049045563,
"rewards/margins": 0.5244689583778381,
"rewards/rejected": -0.9872736930847168,
"step": 313
},
{
"epoch": 2.37,
"learning_rate": 1.151685393258427e-07,
"logps/chosen": -24.555809020996094,
"logps/rejected": -32.58570861816406,
"loss": 0.6002,
"losses/dpo": 0.643724262714386,
"losses/sft": 0.86636883020401,
"losses/total": 0.643724262714386,
"ref_logps/chosen": -20.17746353149414,
"ref_logps/rejected": -24.76491928100586,
"rewards/accuracies": 0.6640625,
"rewards/chosen": -0.4378345012664795,
"rewards/margins": 0.3442443907260895,
"rewards/rejected": -0.7820788621902466,
"step": 314
},
{
"epoch": 2.38,
"learning_rate": 1.1376404494382023e-07,
"logps/chosen": -23.88672637939453,
"logps/rejected": -34.9397087097168,
"loss": 0.529,
"losses/dpo": 0.5676740407943726,
"losses/sft": 0.8977797627449036,
"losses/total": 0.5676740407943726,
"ref_logps/chosen": -19.26026153564453,
"ref_logps/rejected": -25.219451904296875,
"rewards/accuracies": 0.7578125,
"rewards/chosen": -0.4626464247703552,
"rewards/margins": 0.5093792676925659,
"rewards/rejected": -0.9720257520675659,
"step": 315
},
{
"epoch": 2.38,
"learning_rate": 1.1235955056179774e-07,
"logps/chosen": -28.308603286743164,
"logps/rejected": -38.62759017944336,
"loss": 0.5376,
"losses/dpo": 0.4975647032260895,
"losses/sft": 1.098832130432129,
"losses/total": 0.4975647032260895,
"ref_logps/chosen": -22.983192443847656,
"ref_logps/rejected": -28.174081802368164,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.5325409173965454,
"rewards/margins": 0.5128099918365479,
"rewards/rejected": -1.0453509092330933,
"step": 316
},
{
"epoch": 2.39,
"learning_rate": 1.1095505617977527e-07,
"logps/chosen": -29.64281463623047,
"logps/rejected": -34.92308807373047,
"loss": 0.6085,
"losses/dpo": 0.6205140352249146,
"losses/sft": 1.0714130401611328,
"losses/total": 0.6205140352249146,
"ref_logps/chosen": -23.70892333984375,
"ref_logps/rejected": -25.790882110595703,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.5933888554573059,
"rewards/margins": 0.31983205676078796,
"rewards/rejected": -0.9132209420204163,
"step": 317
},
{
"epoch": 2.4,
"learning_rate": 1.095505617977528e-07,
"logps/chosen": -27.25480079650879,
"logps/rejected": -35.160831451416016,
"loss": 0.5594,
"losses/dpo": 0.46756136417388916,
"losses/sft": 1.0184146165847778,
"losses/total": 0.46756136417388916,
"ref_logps/chosen": -22.160459518432617,
"ref_logps/rejected": -25.47940444946289,
"rewards/accuracies": 0.7109375,
"rewards/chosen": -0.5094340443611145,
"rewards/margins": 0.45870864391326904,
"rewards/rejected": -0.9681426882743835,
"step": 318
},
{
"epoch": 2.41,
"learning_rate": 1.0814606741573033e-07,
"logps/chosen": -28.060367584228516,
"logps/rejected": -35.89076232910156,
"loss": 0.5957,
"losses/dpo": 0.6158527135848999,
"losses/sft": 0.9492118954658508,
"losses/total": 0.6158527135848999,
"ref_logps/chosen": -22.560768127441406,
"ref_logps/rejected": -26.671247482299805,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -0.5499600172042847,
"rewards/margins": 0.3719918131828308,
"rewards/rejected": -0.9219518899917603,
"step": 319
},
{
"epoch": 2.42,
"learning_rate": 1.0674157303370785e-07,
"logps/chosen": -28.077362060546875,
"logps/rejected": -32.179466247558594,
"loss": 0.579,
"losses/dpo": 0.5990191698074341,
"losses/sft": 1.0173970460891724,
"losses/total": 0.5990191698074341,
"ref_logps/chosen": -23.08903694152832,
"ref_logps/rejected": -23.298954010009766,
"rewards/accuracies": 0.6796875,
"rewards/chosen": -0.4988323450088501,
"rewards/margins": 0.38921868801116943,
"rewards/rejected": -0.8880510330200195,
"step": 320
},
{
"epoch": 2.42,
"learning_rate": 1.0533707865168538e-07,
"logps/chosen": -28.690523147583008,
"logps/rejected": -34.72178268432617,
"loss": 0.5694,
"losses/dpo": 0.6682005524635315,
"losses/sft": 0.9579723477363586,
"losses/total": 0.6682005524635315,
"ref_logps/chosen": -23.495473861694336,
"ref_logps/rejected": -25.300090789794922,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -0.519504964351654,
"rewards/margins": 0.42266416549682617,
"rewards/rejected": -0.942169189453125,
"step": 321
},
{
"epoch": 2.43,
"learning_rate": 1.0393258426966293e-07,
"logps/chosen": -25.57655143737793,
"logps/rejected": -33.412723541259766,
"loss": 0.5471,
"losses/dpo": 0.6572707891464233,
"losses/sft": 1.028795599937439,
"losses/total": 0.6572707891464233,
"ref_logps/chosen": -21.189647674560547,
"ref_logps/rejected": -24.523921966552734,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.4386903643608093,
"rewards/margins": 0.45018988847732544,
"rewards/rejected": -0.8888803124427795,
"step": 322
},
{
"epoch": 2.44,
"learning_rate": 1.0252808988764044e-07,
"logps/chosen": -29.36073112487793,
"logps/rejected": -33.21003723144531,
"loss": 0.5813,
"losses/dpo": 0.5923129916191101,
"losses/sft": 0.9457908272743225,
"losses/total": 0.5923129916191101,
"ref_logps/chosen": -24.347349166870117,
"ref_logps/rejected": -24.210060119628906,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -0.5013381838798523,
"rewards/margins": 0.3986593186855316,
"rewards/rejected": -0.8999974727630615,
"step": 323
},
{
"epoch": 2.45,
"learning_rate": 1.0112359550561797e-07,
"logps/chosen": -25.595857620239258,
"logps/rejected": -36.034000396728516,
"loss": 0.5458,
"losses/dpo": 0.5657480359077454,
"losses/sft": 0.9646883606910706,
"losses/total": 0.5657480359077454,
"ref_logps/chosen": -21.133930206298828,
"ref_logps/rejected": -26.436534881591797,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -0.44619300961494446,
"rewards/margins": 0.5135533809661865,
"rewards/rejected": -0.9597463607788086,
"step": 324
},
{
"epoch": 2.45,
"learning_rate": 9.971910112359549e-08,
"logps/chosen": -25.68283462524414,
"logps/rejected": -35.9984130859375,
"loss": 0.5137,
"losses/dpo": 0.5698142051696777,
"losses/sft": 0.9736462235450745,
"losses/total": 0.5698142051696777,
"ref_logps/chosen": -21.976032257080078,
"ref_logps/rejected": -26.659069061279297,
"rewards/accuracies": 0.796875,
"rewards/chosen": -0.370680034160614,
"rewards/margins": 0.5632542371749878,
"rewards/rejected": -0.9339342713356018,
"step": 325
},
{
"epoch": 2.46,
"learning_rate": 9.831460674157303e-08,
"logps/chosen": -27.503202438354492,
"logps/rejected": -37.614471435546875,
"loss": 0.5574,
"losses/dpo": 0.6440725326538086,
"losses/sft": 0.963034987449646,
"losses/total": 0.6440725326538086,
"ref_logps/chosen": -22.742584228515625,
"ref_logps/rejected": -28.33091163635254,
"rewards/accuracies": 0.703125,
"rewards/chosen": -0.47606179118156433,
"rewards/margins": 0.45229417085647583,
"rewards/rejected": -0.9283559918403625,
"step": 326
},
{
"epoch": 2.47,
"learning_rate": 9.691011235955055e-08,
"logps/chosen": -27.817134857177734,
"logps/rejected": -34.62038803100586,
"loss": 0.5561,
"losses/dpo": 0.6037241816520691,
"losses/sft": 0.9915317296981812,
"losses/total": 0.6037241816520691,
"ref_logps/chosen": -23.01769256591797,
"ref_logps/rejected": -25.366844177246094,
"rewards/accuracies": 0.7421875,
"rewards/chosen": -0.47994428873062134,
"rewards/margins": 0.44541001319885254,
"rewards/rejected": -0.9253543019294739,
"step": 327
},
{
"epoch": 2.48,
"learning_rate": 9.550561797752808e-08,
"logps/chosen": -29.053791046142578,
"logps/rejected": -36.763641357421875,
"loss": 0.5811,
"losses/dpo": 0.5880983471870422,
"losses/sft": 1.1540213823318481,
"losses/total": 0.5880983471870422,
"ref_logps/chosen": -23.688079833984375,
"ref_logps/rejected": -27.189102172851562,
"rewards/accuracies": 0.6796875,
"rewards/chosen": -0.5365712642669678,
"rewards/margins": 0.42088285088539124,
"rewards/rejected": -0.9574541449546814,
"step": 328
},
{
"epoch": 2.48,
"learning_rate": 9.410112359550561e-08,
"logps/chosen": -25.49301528930664,
"logps/rejected": -34.453372955322266,
"loss": 0.56,
"losses/dpo": 0.4375653862953186,
"losses/sft": 1.0353739261627197,
"losses/total": 0.4375653862953186,
"ref_logps/chosen": -21.16693115234375,
"ref_logps/rejected": -25.54058837890625,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -0.43260854482650757,
"rewards/margins": 0.45866984128952026,
"rewards/rejected": -0.8912783861160278,
"step": 329
},
{
"epoch": 2.49,
"learning_rate": 9.269662921348314e-08,
"logps/chosen": -27.841381072998047,
"logps/rejected": -31.897401809692383,
"loss": 0.5787,
"losses/dpo": 0.6422601938247681,
"losses/sft": 0.9122541546821594,
"losses/total": 0.6422601938247681,
"ref_logps/chosen": -23.25330352783203,
"ref_logps/rejected": -23.617647171020508,
"rewards/accuracies": 0.7109375,
"rewards/chosen": -0.45880773663520813,
"rewards/margins": 0.3691678047180176,
"rewards/rejected": -0.8279755115509033,
"step": 330
},
{
"epoch": 2.5,
"learning_rate": 9.129213483146067e-08,
"logps/chosen": -28.4913330078125,
"logps/rejected": -37.258323669433594,
"loss": 0.5962,
"losses/dpo": 0.7063708901405334,
"losses/sft": 1.0378127098083496,
"losses/total": 0.7063708901405334,
"ref_logps/chosen": -22.901752471923828,
"ref_logps/rejected": -27.694263458251953,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.558958113193512,
"rewards/margins": 0.3974474370479584,
"rewards/rejected": -0.956405520439148,
"step": 331
},
{
"epoch": 2.51,
"learning_rate": 8.988764044943819e-08,
"logps/chosen": -28.342041015625,
"logps/rejected": -36.76737976074219,
"loss": 0.5382,
"losses/dpo": 0.6065940856933594,
"losses/sft": 1.061606526374817,
"losses/total": 0.6065940856933594,
"ref_logps/chosen": -22.77169418334961,
"ref_logps/rejected": -26.355728149414062,
"rewards/accuracies": 0.7578125,
"rewards/chosen": -0.5570348501205444,
"rewards/margins": 0.48413002490997314,
"rewards/rejected": -1.0411648750305176,
"step": 332
},
{
"epoch": 2.51,
"learning_rate": 8.848314606741572e-08,
"logps/chosen": -28.1925048828125,
"logps/rejected": -36.757789611816406,
"loss": 0.542,
"losses/dpo": 0.5146865844726562,
"losses/sft": 0.82643723487854,
"losses/total": 0.5146865844726562,
"ref_logps/chosen": -22.9779052734375,
"ref_logps/rejected": -26.85318374633789,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.5214601755142212,
"rewards/margins": 0.4690002501010895,
"rewards/rejected": -0.9904604554176331,
"step": 333
},
{
"epoch": 2.52,
"learning_rate": 8.707865168539325e-08,
"logps/chosen": -26.62852668762207,
"logps/rejected": -34.3861083984375,
"loss": 0.5288,
"losses/dpo": 0.554874062538147,
"losses/sft": 0.9589724540710449,
"losses/total": 0.554874062538147,
"ref_logps/chosen": -22.37548065185547,
"ref_logps/rejected": -25.256423950195312,
"rewards/accuracies": 0.7578125,
"rewards/chosen": -0.4253048598766327,
"rewards/margins": 0.4876634180545807,
"rewards/rejected": -0.9129682183265686,
"step": 334
},
{
"epoch": 2.53,
"learning_rate": 8.567415730337078e-08,
"logps/chosen": -30.657304763793945,
"logps/rejected": -35.13941192626953,
"loss": 0.564,
"losses/dpo": 0.5628042817115784,
"losses/sft": 0.9816582202911377,
"losses/total": 0.5628042817115784,
"ref_logps/chosen": -25.568553924560547,
"ref_logps/rejected": -26.13833236694336,
"rewards/accuracies": 0.765625,
"rewards/chosen": -0.5088753700256348,
"rewards/margins": 0.3912326395511627,
"rewards/rejected": -0.9001079797744751,
"step": 335
},
{
"epoch": 2.54,
"learning_rate": 8.426966292134831e-08,
"logps/chosen": -27.922801971435547,
"logps/rejected": -35.52781677246094,
"loss": 0.5751,
"losses/dpo": 0.5604207515716553,
"losses/sft": 0.9308174848556519,
"losses/total": 0.5604207515716553,
"ref_logps/chosen": -22.82427978515625,
"ref_logps/rejected": -26.594511032104492,
"rewards/accuracies": 0.7578125,
"rewards/chosen": -0.5098517537117004,
"rewards/margins": 0.38347893953323364,
"rewards/rejected": -0.8933306932449341,
"step": 336
},
{
"epoch": 2.54,
"learning_rate": 8.286516853932583e-08,
"logps/chosen": -26.58924674987793,
"logps/rejected": -37.19468688964844,
"loss": 0.5092,
"losses/dpo": 0.5189211368560791,
"losses/sft": 0.9888613224029541,
"losses/total": 0.5189211368560791,
"ref_logps/chosen": -22.07261085510254,
"ref_logps/rejected": -26.852371215820312,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.45166367292404175,
"rewards/margins": 0.5825679302215576,
"rewards/rejected": -1.0342316627502441,
"step": 337
},
{
"epoch": 2.55,
"learning_rate": 8.146067415730337e-08,
"logps/chosen": -26.25957489013672,
"logps/rejected": -34.435699462890625,
"loss": 0.5636,
"losses/dpo": 0.5064201951026917,
"losses/sft": 1.0053503513336182,
"losses/total": 0.5064201951026917,
"ref_logps/chosen": -21.541664123535156,
"ref_logps/rejected": -25.595813751220703,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -0.47179120779037476,
"rewards/margins": 0.41219767928123474,
"rewards/rejected": -0.8839888572692871,
"step": 338
},
{
"epoch": 2.56,
"learning_rate": 8.005617977528089e-08,
"logps/chosen": -28.839031219482422,
"logps/rejected": -37.7884635925293,
"loss": 0.5099,
"losses/dpo": 0.5300096273422241,
"losses/sft": 0.9853606224060059,
"losses/total": 0.5300096273422241,
"ref_logps/chosen": -24.249427795410156,
"ref_logps/rejected": -27.3421688079834,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.4589604139328003,
"rewards/margins": 0.5856689810752869,
"rewards/rejected": -1.0446293354034424,
"step": 339
},
{
"epoch": 2.57,
"learning_rate": 7.865168539325842e-08,
"logps/chosen": -24.501375198364258,
"logps/rejected": -35.306766510009766,
"loss": 0.5489,
"losses/dpo": 0.5961363315582275,
"losses/sft": 1.0056707859039307,
"losses/total": 0.5961363315582275,
"ref_logps/chosen": -19.69840431213379,
"ref_logps/rejected": -26.113300323486328,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.4802970886230469,
"rewards/margins": 0.43904954195022583,
"rewards/rejected": -0.9193466305732727,
"step": 340
},
{
"epoch": 2.57,
"learning_rate": 7.724719101123594e-08,
"logps/chosen": -28.589330673217773,
"logps/rejected": -37.83360290527344,
"loss": 0.5047,
"losses/dpo": 0.584464430809021,
"losses/sft": 1.1327065229415894,
"losses/total": 0.584464430809021,
"ref_logps/chosen": -23.609786987304688,
"ref_logps/rejected": -27.048458099365234,
"rewards/accuracies": 0.7734375,
"rewards/chosen": -0.49795451760292053,
"rewards/margins": 0.5805596113204956,
"rewards/rejected": -1.0785142183303833,
"step": 341
},
{
"epoch": 2.58,
"learning_rate": 7.584269662921348e-08,
"logps/chosen": -26.895156860351562,
"logps/rejected": -36.51441192626953,
"loss": 0.5664,
"losses/dpo": 0.5289937257766724,
"losses/sft": 0.9366389513015747,
"losses/total": 0.5289937257766724,
"ref_logps/chosen": -21.687744140625,
"ref_logps/rejected": -27.214290618896484,
"rewards/accuracies": 0.6953125,
"rewards/chosen": -0.5207412838935852,
"rewards/margins": 0.40927091240882874,
"rewards/rejected": -0.9300122261047363,
"step": 342
},
{
"epoch": 2.59,
"learning_rate": 7.443820224719101e-08,
"logps/chosen": -26.770729064941406,
"logps/rejected": -37.010292053222656,
"loss": 0.5614,
"losses/dpo": 0.5385686159133911,
"losses/sft": 1.026196002960205,
"losses/total": 0.5385686159133911,
"ref_logps/chosen": -21.59899139404297,
"ref_logps/rejected": -27.5603084564209,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.5171737670898438,
"rewards/margins": 0.4278249144554138,
"rewards/rejected": -0.9449986815452576,
"step": 343
},
{
"epoch": 2.6,
"learning_rate": 7.303370786516853e-08,
"logps/chosen": -28.282766342163086,
"logps/rejected": -37.94123840332031,
"loss": 0.5574,
"losses/dpo": 0.4933924973011017,
"losses/sft": 1.0346543788909912,
"losses/total": 0.4933924973011017,
"ref_logps/chosen": -23.10584831237793,
"ref_logps/rejected": -27.940820693969727,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -0.5176920294761658,
"rewards/margins": 0.48235008120536804,
"rewards/rejected": -1.0000420808792114,
"step": 344
},
{
"epoch": 2.6,
"learning_rate": 7.162921348314606e-08,
"logps/chosen": -28.72567367553711,
"logps/rejected": -39.72615432739258,
"loss": 0.5095,
"losses/dpo": 0.46729788184165955,
"losses/sft": 1.0185084342956543,
"losses/total": 0.46729788184165955,
"ref_logps/chosen": -23.77487564086914,
"ref_logps/rejected": -29.071794509887695,
"rewards/accuracies": 0.8046875,
"rewards/chosen": -0.4950796663761139,
"rewards/margins": 0.5703563690185547,
"rewards/rejected": -1.0654358863830566,
"step": 345
},
{
"epoch": 2.61,
"learning_rate": 7.022471910112359e-08,
"logps/chosen": -25.868453979492188,
"logps/rejected": -38.04795455932617,
"loss": 0.5312,
"losses/dpo": 0.5006756782531738,
"losses/sft": 0.9233719110488892,
"losses/total": 0.5006756782531738,
"ref_logps/chosen": -21.202287673950195,
"ref_logps/rejected": -28.387357711791992,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.4666164517402649,
"rewards/margins": 0.49944305419921875,
"rewards/rejected": -0.9660595059394836,
"step": 346
},
{
"epoch": 2.62,
"learning_rate": 6.882022471910112e-08,
"logps/chosen": -30.674976348876953,
"logps/rejected": -37.18801498413086,
"loss": 0.5609,
"losses/dpo": 0.53383469581604,
"losses/sft": 1.0966167449951172,
"losses/total": 0.53383469581604,
"ref_logps/chosen": -25.279560089111328,
"ref_logps/rejected": -27.363300323486328,
"rewards/accuracies": 0.734375,
"rewards/chosen": -0.5395419001579285,
"rewards/margins": 0.4429297149181366,
"rewards/rejected": -0.9824715852737427,
"step": 347
},
{
"epoch": 2.63,
"learning_rate": 6.741573033707864e-08,
"logps/chosen": -27.533077239990234,
"logps/rejected": -36.4505729675293,
"loss": 0.5459,
"losses/dpo": 0.5303448438644409,
"losses/sft": 1.0059340000152588,
"losses/total": 0.5303448438644409,
"ref_logps/chosen": -23.062650680541992,
"ref_logps/rejected": -27.07624053955078,
"rewards/accuracies": 0.734375,
"rewards/chosen": -0.4470424950122833,
"rewards/margins": 0.49039074778556824,
"rewards/rejected": -0.9374332427978516,
"step": 348
},
{
"epoch": 2.63,
"learning_rate": 6.601123595505617e-08,
"logps/chosen": -27.371315002441406,
"logps/rejected": -37.460845947265625,
"loss": 0.5395,
"losses/dpo": 0.46134790778160095,
"losses/sft": 1.0326218605041504,
"losses/total": 0.46134790778160095,
"ref_logps/chosen": -21.61273765563965,
"ref_logps/rejected": -26.598316192626953,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -0.5758577585220337,
"rewards/margins": 0.5103954076766968,
"rewards/rejected": -1.0862531661987305,
"step": 349
},
{
"epoch": 2.64,
"learning_rate": 6.460674157303371e-08,
"logps/chosen": -26.683940887451172,
"logps/rejected": -35.51959228515625,
"loss": 0.531,
"losses/dpo": 0.5929858684539795,
"losses/sft": 0.8796969056129456,
"losses/total": 0.5929858684539795,
"ref_logps/chosen": -21.84777069091797,
"ref_logps/rejected": -25.34250259399414,
"rewards/accuracies": 0.7578125,
"rewards/chosen": -0.48361673951148987,
"rewards/margins": 0.5340923070907593,
"rewards/rejected": -1.0177090167999268,
"step": 350
},
{
"epoch": 2.65,
"learning_rate": 6.320224719101123e-08,
"logps/chosen": -27.784767150878906,
"logps/rejected": -36.31642150878906,
"loss": 0.5638,
"losses/dpo": 0.46039754152297974,
"losses/sft": 1.014696478843689,
"losses/total": 0.46039754152297974,
"ref_logps/chosen": -22.830692291259766,
"ref_logps/rejected": -26.99026107788086,
"rewards/accuracies": 0.6484375,
"rewards/chosen": -0.4954075217247009,
"rewards/margins": 0.4372091293334961,
"rewards/rejected": -0.9326165914535522,
"step": 351
},
{
"epoch": 2.66,
"learning_rate": 6.179775280898876e-08,
"logps/chosen": -23.8892765045166,
"logps/rejected": -32.802425384521484,
"loss": 0.5307,
"losses/dpo": 0.5120245218276978,
"losses/sft": 0.9590541124343872,
"losses/total": 0.5120245218276978,
"ref_logps/chosen": -19.922191619873047,
"ref_logps/rejected": -23.754770278930664,
"rewards/accuracies": 0.7890625,
"rewards/chosen": -0.3967083692550659,
"rewards/margins": 0.5080575346946716,
"rewards/rejected": -0.9047658443450928,
"step": 352
},
{
"epoch": 2.66,
"learning_rate": 6.039325842696629e-08,
"logps/chosen": -27.902587890625,
"logps/rejected": -39.759193420410156,
"loss": 0.5216,
"losses/dpo": 0.5157948136329651,
"losses/sft": 0.8797988891601562,
"losses/total": 0.5157948136329651,
"ref_logps/chosen": -22.233232498168945,
"ref_logps/rejected": -28.421966552734375,
"rewards/accuracies": 0.796875,
"rewards/chosen": -0.5669355988502502,
"rewards/margins": 0.5667868852615356,
"rewards/rejected": -1.1337225437164307,
"step": 353
},
{
"epoch": 2.67,
"learning_rate": 5.898876404494382e-08,
"logps/chosen": -27.200105667114258,
"logps/rejected": -38.05504608154297,
"loss": 0.5154,
"losses/dpo": 0.6272658705711365,
"losses/sft": 0.901512086391449,
"losses/total": 0.6272658705711365,
"ref_logps/chosen": -22.22509765625,
"ref_logps/rejected": -27.425247192382812,
"rewards/accuracies": 0.7421875,
"rewards/chosen": -0.49750083684921265,
"rewards/margins": 0.5654786825180054,
"rewards/rejected": -1.0629794597625732,
"step": 354
},
{
"epoch": 2.68,
"learning_rate": 5.758426966292135e-08,
"logps/chosen": -29.658336639404297,
"logps/rejected": -38.597557067871094,
"loss": 0.5507,
"losses/dpo": 0.4642670750617981,
"losses/sft": 1.0486382246017456,
"losses/total": 0.4642670750617981,
"ref_logps/chosen": -23.814481735229492,
"ref_logps/rejected": -27.686166763305664,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.5843857526779175,
"rewards/margins": 0.5067534446716309,
"rewards/rejected": -1.0911391973495483,
"step": 355
},
{
"epoch": 2.69,
"learning_rate": 5.617977528089887e-08,
"logps/chosen": -23.939620971679688,
"logps/rejected": -33.06968688964844,
"loss": 0.5631,
"losses/dpo": 0.5865851640701294,
"losses/sft": 1.1602400541305542,
"losses/total": 0.5865851640701294,
"ref_logps/chosen": -19.01073455810547,
"ref_logps/rejected": -23.945262908935547,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -0.4928884506225586,
"rewards/margins": 0.4195541441440582,
"rewards/rejected": -0.9124425649642944,
"step": 356
},
{
"epoch": 2.69,
"learning_rate": 5.47752808988764e-08,
"logps/chosen": -27.94991683959961,
"logps/rejected": -36.65930938720703,
"loss": 0.5461,
"losses/dpo": 0.6400465369224548,
"losses/sft": 1.0134565830230713,
"losses/total": 0.6400465369224548,
"ref_logps/chosen": -23.591896057128906,
"ref_logps/rejected": -27.34914779663086,
"rewards/accuracies": 0.7578125,
"rewards/chosen": -0.43580204248428345,
"rewards/margins": 0.4952143728733063,
"rewards/rejected": -0.9310164451599121,
"step": 357
},
{
"epoch": 2.7,
"learning_rate": 5.3370786516853926e-08,
"logps/chosen": -28.148937225341797,
"logps/rejected": -34.08583450317383,
"loss": 0.561,
"losses/dpo": 0.47015029191970825,
"losses/sft": 0.923213005065918,
"losses/total": 0.47015029191970825,
"ref_logps/chosen": -23.29110336303711,
"ref_logps/rejected": -25.051483154296875,
"rewards/accuracies": 0.703125,
"rewards/chosen": -0.4857832193374634,
"rewards/margins": 0.417651891708374,
"rewards/rejected": -0.9034351110458374,
"step": 358
},
{
"epoch": 2.71,
"learning_rate": 5.196629213483146e-08,
"logps/chosen": -26.931848526000977,
"logps/rejected": -35.78190994262695,
"loss": 0.5196,
"losses/dpo": 0.4919354021549225,
"losses/sft": 0.9875601530075073,
"losses/total": 0.4919354021549225,
"ref_logps/chosen": -22.220352172851562,
"ref_logps/rejected": -25.88389778137207,
"rewards/accuracies": 0.7421875,
"rewards/chosen": -0.4711495637893677,
"rewards/margins": 0.51865154504776,
"rewards/rejected": -0.9898011684417725,
"step": 359
},
{
"epoch": 2.72,
"learning_rate": 5.056179775280899e-08,
"logps/chosen": -26.860830307006836,
"logps/rejected": -36.244728088378906,
"loss": 0.5141,
"losses/dpo": 0.5143895745277405,
"losses/sft": 0.8888437747955322,
"losses/total": 0.5143895745277405,
"ref_logps/chosen": -22.12276840209961,
"ref_logps/rejected": -25.870086669921875,
"rewards/accuracies": 0.796875,
"rewards/chosen": -0.4738062918186188,
"rewards/margins": 0.5636579394340515,
"rewards/rejected": -1.0374642610549927,
"step": 360
},
{
"epoch": 2.72,
"learning_rate": 4.915730337078652e-08,
"logps/chosen": -27.645713806152344,
"logps/rejected": -35.34681701660156,
"loss": 0.5612,
"losses/dpo": 0.5186240077018738,
"losses/sft": 1.109127402305603,
"losses/total": 0.5186240077018738,
"ref_logps/chosen": -22.531267166137695,
"ref_logps/rejected": -25.30887222290039,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.5114448070526123,
"rewards/margins": 0.4923498034477234,
"rewards/rejected": -1.0037946701049805,
"step": 361
},
{
"epoch": 2.73,
"learning_rate": 4.775280898876404e-08,
"logps/chosen": -29.528343200683594,
"logps/rejected": -37.111507415771484,
"loss": 0.5701,
"losses/dpo": 0.5167029500007629,
"losses/sft": 1.1346383094787598,
"losses/total": 0.5167029500007629,
"ref_logps/chosen": -23.850698471069336,
"ref_logps/rejected": -27.338424682617188,
"rewards/accuracies": 0.703125,
"rewards/chosen": -0.5677646398544312,
"rewards/margins": 0.40954357385635376,
"rewards/rejected": -0.9773082137107849,
"step": 362
},
{
"epoch": 2.74,
"learning_rate": 4.634831460674157e-08,
"logps/chosen": -27.099462509155273,
"logps/rejected": -38.734046936035156,
"loss": 0.5367,
"losses/dpo": 0.6075611114501953,
"losses/sft": 1.0922847986221313,
"losses/total": 0.6075611114501953,
"ref_logps/chosen": -21.647756576538086,
"ref_logps/rejected": -27.990768432617188,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -0.5451704859733582,
"rewards/margins": 0.5291576385498047,
"rewards/rejected": -1.0743281841278076,
"step": 363
},
{
"epoch": 2.75,
"learning_rate": 4.4943820224719096e-08,
"logps/chosen": -28.804433822631836,
"logps/rejected": -38.87983703613281,
"loss": 0.5448,
"losses/dpo": 0.5679644346237183,
"losses/sft": 1.123085618019104,
"losses/total": 0.5679644346237183,
"ref_logps/chosen": -23.055761337280273,
"ref_logps/rejected": -27.842578887939453,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.5748672485351562,
"rewards/margins": 0.5288586020469666,
"rewards/rejected": -1.1037259101867676,
"step": 364
},
{
"epoch": 2.75,
"learning_rate": 4.3539325842696626e-08,
"logps/chosen": -29.942031860351562,
"logps/rejected": -37.742164611816406,
"loss": 0.5544,
"losses/dpo": 0.4389882981777191,
"losses/sft": 0.9757397174835205,
"losses/total": 0.4389882981777191,
"ref_logps/chosen": -24.796215057373047,
"ref_logps/rejected": -27.602325439453125,
"rewards/accuracies": 0.6640625,
"rewards/chosen": -0.5145817995071411,
"rewards/margins": 0.4994018077850342,
"rewards/rejected": -1.0139836072921753,
"step": 365
},
{
"epoch": 2.76,
"learning_rate": 4.213483146067416e-08,
"logps/chosen": -30.154991149902344,
"logps/rejected": -35.81608581542969,
"loss": 0.57,
"losses/dpo": 0.571212887763977,
"losses/sft": 0.8268208503723145,
"losses/total": 0.571212887763977,
"ref_logps/chosen": -24.421096801757812,
"ref_logps/rejected": -25.88280487060547,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.5733895897865295,
"rewards/margins": 0.4199383854866028,
"rewards/rejected": -0.9933279752731323,
"step": 366
},
{
"epoch": 2.77,
"learning_rate": 4.073033707865169e-08,
"logps/chosen": -27.25971794128418,
"logps/rejected": -33.205955505371094,
"loss": 0.5874,
"losses/dpo": 0.4875527620315552,
"losses/sft": 0.8703315854072571,
"losses/total": 0.4875527620315552,
"ref_logps/chosen": -22.749954223632812,
"ref_logps/rejected": -24.781803131103516,
"rewards/accuracies": 0.734375,
"rewards/chosen": -0.45097634196281433,
"rewards/margins": 0.3914392292499542,
"rewards/rejected": -0.8424156308174133,
"step": 367
},
{
"epoch": 2.78,
"learning_rate": 3.932584269662921e-08,
"logps/chosen": -28.230928421020508,
"logps/rejected": -37.86750030517578,
"loss": 0.508,
"losses/dpo": 0.4668968617916107,
"losses/sft": 1.1078698635101318,
"losses/total": 0.4668968617916107,
"ref_logps/chosen": -23.454715728759766,
"ref_logps/rejected": -27.212678909301758,
"rewards/accuracies": 0.765625,
"rewards/chosen": -0.47762107849121094,
"rewards/margins": 0.5878612995147705,
"rewards/rejected": -1.0654823780059814,
"step": 368
},
{
"epoch": 2.78,
"learning_rate": 3.792134831460674e-08,
"logps/chosen": -28.5417423248291,
"logps/rejected": -39.07720184326172,
"loss": 0.5722,
"losses/dpo": 0.5119404196739197,
"losses/sft": 1.0701940059661865,
"losses/total": 0.5119404196739197,
"ref_logps/chosen": -22.97249984741211,
"ref_logps/rejected": -29.236312866210938,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.5569244623184204,
"rewards/margins": 0.42716455459594727,
"rewards/rejected": -0.9840888977050781,
"step": 369
},
{
"epoch": 2.79,
"learning_rate": 3.6516853932584266e-08,
"logps/chosen": -24.37343406677246,
"logps/rejected": -35.577354431152344,
"loss": 0.5144,
"losses/dpo": 0.39502987265586853,
"losses/sft": 1.0756311416625977,
"losses/total": 0.39502987265586853,
"ref_logps/chosen": -19.630115509033203,
"ref_logps/rejected": -25.026439666748047,
"rewards/accuracies": 0.765625,
"rewards/chosen": -0.474331796169281,
"rewards/margins": 0.5807597041130066,
"rewards/rejected": -1.055091381072998,
"step": 370
},
{
"epoch": 2.8,
"learning_rate": 3.5112359550561796e-08,
"logps/chosen": -25.75430679321289,
"logps/rejected": -35.49622344970703,
"loss": 0.5757,
"losses/dpo": 0.5865879058837891,
"losses/sft": 1.0159986019134521,
"losses/total": 0.5865879058837891,
"ref_logps/chosen": -21.11154556274414,
"ref_logps/rejected": -26.56639862060547,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.4642760157585144,
"rewards/margins": 0.42870670557022095,
"rewards/rejected": -0.8929827213287354,
"step": 371
},
{
"epoch": 2.81,
"learning_rate": 3.370786516853932e-08,
"logps/chosen": -28.015663146972656,
"logps/rejected": -36.934810638427734,
"loss": 0.507,
"losses/dpo": 0.6495200395584106,
"losses/sft": 1.097916841506958,
"losses/total": 0.6495200395584106,
"ref_logps/chosen": -23.48037338256836,
"ref_logps/rejected": -26.453636169433594,
"rewards/accuracies": 0.7734375,
"rewards/chosen": -0.45352903008461,
"rewards/margins": 0.5945882797241211,
"rewards/rejected": -1.0481172800064087,
"step": 372
},
{
"epoch": 2.82,
"learning_rate": 3.230337078651686e-08,
"logps/chosen": -29.112939834594727,
"logps/rejected": -35.190895080566406,
"loss": 0.5557,
"losses/dpo": 0.3998969793319702,
"losses/sft": 0.9329382181167603,
"losses/total": 0.3998969793319702,
"ref_logps/chosen": -23.576570510864258,
"ref_logps/rejected": -24.735258102416992,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.5536371469497681,
"rewards/margins": 0.49192649126052856,
"rewards/rejected": -1.0455635786056519,
"step": 373
},
{
"epoch": 2.82,
"learning_rate": 3.089887640449438e-08,
"logps/chosen": -28.713830947875977,
"logps/rejected": -37.038963317871094,
"loss": 0.5536,
"losses/dpo": 0.548796534538269,
"losses/sft": 1.0410091876983643,
"losses/total": 0.548796534538269,
"ref_logps/chosen": -23.76250457763672,
"ref_logps/rejected": -27.5520076751709,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.49513280391693115,
"rewards/margins": 0.45356276631355286,
"rewards/rejected": -0.9486956000328064,
"step": 374
},
{
"epoch": 2.83,
"learning_rate": 2.949438202247191e-08,
"logps/chosen": -29.465068817138672,
"logps/rejected": -39.406578063964844,
"loss": 0.5343,
"losses/dpo": 0.6983579397201538,
"losses/sft": 1.0986469984054565,
"losses/total": 0.6983579397201538,
"ref_logps/chosen": -24.418071746826172,
"ref_logps/rejected": -28.89803695678711,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.5046992897987366,
"rewards/margins": 0.5461547374725342,
"rewards/rejected": -1.050853967666626,
"step": 375
},
{
"epoch": 2.84,
"learning_rate": 2.8089887640449436e-08,
"logps/chosen": -27.72464370727539,
"logps/rejected": -35.225887298583984,
"loss": 0.5838,
"losses/dpo": 0.615436851978302,
"losses/sft": 1.064025640487671,
"losses/total": 0.615436851978302,
"ref_logps/chosen": -22.502582550048828,
"ref_logps/rejected": -26.32878875732422,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -0.5222063660621643,
"rewards/margins": 0.36750373244285583,
"rewards/rejected": -0.8897100687026978,
"step": 376
},
{
"epoch": 2.85,
"learning_rate": 2.6685393258426963e-08,
"logps/chosen": -26.368640899658203,
"logps/rejected": -35.305564880371094,
"loss": 0.5351,
"losses/dpo": 0.5552591681480408,
"losses/sft": 0.8796924352645874,
"losses/total": 0.5552591681480408,
"ref_logps/chosen": -21.371036529541016,
"ref_logps/rejected": -25.149438858032227,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.49976038932800293,
"rewards/margins": 0.5158523917198181,
"rewards/rejected": -1.0156128406524658,
"step": 377
},
{
"epoch": 2.85,
"learning_rate": 2.5280898876404493e-08,
"logps/chosen": -29.838565826416016,
"logps/rejected": -38.05325698852539,
"loss": 0.5338,
"losses/dpo": 0.4755927324295044,
"losses/sft": 0.9763241410255432,
"losses/total": 0.4755927324295044,
"ref_logps/chosen": -24.918655395507812,
"ref_logps/rejected": -28.04724884033203,
"rewards/accuracies": 0.734375,
"rewards/chosen": -0.4919911026954651,
"rewards/margins": 0.5086094737052917,
"rewards/rejected": -1.0006005764007568,
"step": 378
},
{
"epoch": 2.86,
"learning_rate": 2.387640449438202e-08,
"logps/chosen": -29.036991119384766,
"logps/rejected": -35.4906005859375,
"loss": 0.5335,
"losses/dpo": 0.5385127067565918,
"losses/sft": 1.245056390762329,
"losses/total": 0.5385127067565918,
"ref_logps/chosen": -23.929513931274414,
"ref_logps/rejected": -24.878217697143555,
"rewards/accuracies": 0.7421875,
"rewards/chosen": -0.5107479095458984,
"rewards/margins": 0.5504903793334961,
"rewards/rejected": -1.0612382888793945,
"step": 379
},
{
"epoch": 2.87,
"learning_rate": 2.2471910112359548e-08,
"logps/chosen": -29.392702102661133,
"logps/rejected": -38.68418884277344,
"loss": 0.545,
"losses/dpo": 0.43943360447883606,
"losses/sft": 1.023887276649475,
"losses/total": 0.43943360447883606,
"ref_logps/chosen": -24.07533073425293,
"ref_logps/rejected": -28.427806854248047,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.5317370891571045,
"rewards/margins": 0.49390077590942383,
"rewards/rejected": -1.0256378650665283,
"step": 380
},
{
"epoch": 2.88,
"learning_rate": 2.106741573033708e-08,
"logps/chosen": -25.038589477539062,
"logps/rejected": -32.384376525878906,
"loss": 0.56,
"losses/dpo": 0.6935892701148987,
"losses/sft": 1.0011663436889648,
"losses/total": 0.6935892701148987,
"ref_logps/chosen": -21.044326782226562,
"ref_logps/rejected": -23.86334991455078,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.3994261920452118,
"rewards/margins": 0.45267629623413086,
"rewards/rejected": -0.8521024584770203,
"step": 381
},
{
"epoch": 2.88,
"learning_rate": 1.9662921348314606e-08,
"logps/chosen": -30.07229995727539,
"logps/rejected": -37.0654411315918,
"loss": 0.5936,
"losses/dpo": 0.547340989112854,
"losses/sft": 1.0020110607147217,
"losses/total": 0.547340989112854,
"ref_logps/chosen": -23.930465698242188,
"ref_logps/rejected": -27.052518844604492,
"rewards/accuracies": 0.6171875,
"rewards/chosen": -0.6141834259033203,
"rewards/margins": 0.38710883259773254,
"rewards/rejected": -1.0012922286987305,
"step": 382
},
{
"epoch": 2.89,
"learning_rate": 1.8258426966292133e-08,
"logps/chosen": -30.305606842041016,
"logps/rejected": -40.710792541503906,
"loss": 0.537,
"losses/dpo": 0.5175353288650513,
"losses/sft": 0.8916615843772888,
"losses/total": 0.5175353288650513,
"ref_logps/chosen": -25.279661178588867,
"ref_logps/rejected": -29.970672607421875,
"rewards/accuracies": 0.6796875,
"rewards/chosen": -0.5025948286056519,
"rewards/margins": 0.5714170932769775,
"rewards/rejected": -1.0740119218826294,
"step": 383
},
{
"epoch": 2.9,
"learning_rate": 1.685393258426966e-08,
"logps/chosen": -29.87887191772461,
"logps/rejected": -39.89691162109375,
"loss": 0.5598,
"losses/dpo": 0.4781198799610138,
"losses/sft": 1.0425841808319092,
"losses/total": 0.4781198799610138,
"ref_logps/chosen": -23.869295120239258,
"ref_logps/rejected": -29.154647827148438,
"rewards/accuracies": 0.7109375,
"rewards/chosen": -0.6009576916694641,
"rewards/margins": 0.4732685387134552,
"rewards/rejected": -1.0742262601852417,
"step": 384
},
{
"epoch": 2.91,
"learning_rate": 1.544943820224719e-08,
"logps/chosen": -26.600048065185547,
"logps/rejected": -39.657188415527344,
"loss": 0.5186,
"losses/dpo": 0.5135948657989502,
"losses/sft": 0.9224843978881836,
"losses/total": 0.5135948657989502,
"ref_logps/chosen": -21.754756927490234,
"ref_logps/rejected": -29.07170867919922,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.48452913761138916,
"rewards/margins": 0.5740190744400024,
"rewards/rejected": -1.0585482120513916,
"step": 385
},
{
"epoch": 2.91,
"learning_rate": 1.4044943820224718e-08,
"logps/chosen": -27.77488136291504,
"logps/rejected": -37.88126754760742,
"loss": 0.5551,
"losses/dpo": 0.5367317199707031,
"losses/sft": 1.0271828174591064,
"losses/total": 0.5367317199707031,
"ref_logps/chosen": -22.3087158203125,
"ref_logps/rejected": -27.471187591552734,
"rewards/accuracies": 0.7578125,
"rewards/chosen": -0.5466164350509644,
"rewards/margins": 0.4943912625312805,
"rewards/rejected": -1.0410076379776,
"step": 386
},
{
"epoch": 2.92,
"learning_rate": 1.2640449438202247e-08,
"logps/chosen": -28.72400665283203,
"logps/rejected": -36.061241149902344,
"loss": 0.5438,
"losses/dpo": 0.5493422746658325,
"losses/sft": 0.9023943543434143,
"losses/total": 0.5493422746658325,
"ref_logps/chosen": -23.460235595703125,
"ref_logps/rejected": -26.00853729248047,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.526377260684967,
"rewards/margins": 0.47889336943626404,
"rewards/rejected": -1.0052706003189087,
"step": 387
},
{
"epoch": 2.93,
"learning_rate": 1.1235955056179774e-08,
"logps/chosen": -27.819026947021484,
"logps/rejected": -37.490928649902344,
"loss": 0.5852,
"losses/dpo": 0.5147813558578491,
"losses/sft": 0.8766761422157288,
"losses/total": 0.5147813558578491,
"ref_logps/chosen": -21.90268898010254,
"ref_logps/rejected": -27.36888885498047,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.5916341543197632,
"rewards/margins": 0.4205697774887085,
"rewards/rejected": -1.0122039318084717,
"step": 388
},
{
"epoch": 2.94,
"learning_rate": 9.831460674157303e-09,
"logps/chosen": -26.303754806518555,
"logps/rejected": -37.83194351196289,
"loss": 0.524,
"losses/dpo": 0.5489503741264343,
"losses/sft": 0.9560513496398926,
"losses/total": 0.5489503741264343,
"ref_logps/chosen": -21.29248046875,
"ref_logps/rejected": -27.238914489746094,
"rewards/accuracies": 0.765625,
"rewards/chosen": -0.5011276006698608,
"rewards/margins": 0.5581751465797424,
"rewards/rejected": -1.059302806854248,
"step": 389
},
{
"epoch": 2.94,
"learning_rate": 8.42696629213483e-09,
"logps/chosen": -26.287546157836914,
"logps/rejected": -38.258975982666016,
"loss": 0.5441,
"losses/dpo": 0.5884628295898438,
"losses/sft": 0.9961035251617432,
"losses/total": 0.5884628295898438,
"ref_logps/chosen": -21.199317932128906,
"ref_logps/rejected": -27.755794525146484,
"rewards/accuracies": 0.7265625,
"rewards/chosen": -0.5088227391242981,
"rewards/margins": 0.5414952635765076,
"rewards/rejected": -1.0503180027008057,
"step": 390
},
{
"epoch": 2.95,
"learning_rate": 7.022471910112359e-09,
"logps/chosen": -29.260208129882812,
"logps/rejected": -35.26235580444336,
"loss": 0.5711,
"losses/dpo": 0.6062160730361938,
"losses/sft": 0.9891349673271179,
"losses/total": 0.6062160730361938,
"ref_logps/chosen": -24.228797912597656,
"ref_logps/rejected": -25.79244613647461,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.5031411051750183,
"rewards/margins": 0.44384992122650146,
"rewards/rejected": -0.9469910264015198,
"step": 391
},
{
"epoch": 2.96,
"learning_rate": 5.617977528089887e-09,
"logps/chosen": -26.954505920410156,
"logps/rejected": -38.197296142578125,
"loss": 0.5188,
"losses/dpo": 0.48179134726524353,
"losses/sft": 1.0057315826416016,
"losses/total": 0.48179134726524353,
"ref_logps/chosen": -22.015995025634766,
"ref_logps/rejected": -27.583335876464844,
"rewards/accuracies": 0.796875,
"rewards/chosen": -0.49385106563568115,
"rewards/margins": 0.5675452351570129,
"rewards/rejected": -1.0613962411880493,
"step": 392
},
{
"epoch": 2.97,
"learning_rate": 4.213483146067415e-09,
"logps/chosen": -25.941349029541016,
"logps/rejected": -37.711891174316406,
"loss": 0.5126,
"losses/dpo": 0.47302547097206116,
"losses/sft": 1.0042707920074463,
"losses/total": 0.47302547097206116,
"ref_logps/chosen": -21.42403793334961,
"ref_logps/rejected": -27.090473175048828,
"rewards/accuracies": 0.703125,
"rewards/chosen": -0.45173099637031555,
"rewards/margins": 0.610410749912262,
"rewards/rejected": -1.0621416568756104,
"step": 393
},
{
"epoch": 2.97,
"learning_rate": 2.8089887640449435e-09,
"logps/chosen": -27.533342361450195,
"logps/rejected": -40.14276123046875,
"loss": 0.5282,
"losses/dpo": 0.47439950704574585,
"losses/sft": 1.004162073135376,
"losses/total": 0.47439950704574585,
"ref_logps/chosen": -22.44705581665039,
"ref_logps/rejected": -29.271793365478516,
"rewards/accuracies": 0.765625,
"rewards/chosen": -0.5086286067962646,
"rewards/margins": 0.5784677267074585,
"rewards/rejected": -1.0870963335037231,
"step": 394
},
{
"epoch": 2.98,
"learning_rate": 1.4044943820224717e-09,
"logps/chosen": -27.44398307800293,
"logps/rejected": -38.508323669433594,
"loss": 0.5377,
"losses/dpo": 0.5113502740859985,
"losses/sft": 1.0710563659667969,
"losses/total": 0.5113502740859985,
"ref_logps/chosen": -22.568340301513672,
"ref_logps/rejected": -28.23776626586914,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.4875642657279968,
"rewards/margins": 0.5394913554191589,
"rewards/rejected": -1.0270556211471558,
"step": 395
},
{
"epoch": 2.99,
"learning_rate": 0.0,
"logps/chosen": -28.845203399658203,
"logps/rejected": -36.77953338623047,
"loss": 0.5692,
"losses/dpo": 0.7008877992630005,
"losses/sft": 1.1200252771377563,
"losses/total": 0.7008877992630005,
"ref_logps/chosen": -23.59469985961914,
"ref_logps/rejected": -27.211450576782227,
"rewards/accuracies": 0.703125,
"rewards/chosen": -0.5250504612922668,
"rewards/margins": 0.4317581057548523,
"rewards/rejected": -0.9568085670471191,
"step": 396
},
{
"epoch": 2.99,
"step": 396,
"total_flos": 0.0,
"train_loss": 0.6152852120423558,
"train_runtime": 11562.7876,
"train_samples_per_second": 4.4,
"train_steps_per_second": 0.034
}
],
"logging_steps": 1.0,
"max_steps": 396,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 200,
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}