{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.972307692307693,
"eval_steps": 1000,
"global_step": 505,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.009846153846153846,
"grad_norm": 38.5,
"learning_rate": 3.125e-06,
"loss": 1.348,
"step": 1
},
{
"epoch": 0.019692307692307693,
"grad_norm": 28.625,
"learning_rate": 6.25e-06,
"loss": 1.3239,
"step": 2
},
{
"epoch": 0.039384615384615386,
"grad_norm": 18.375,
"learning_rate": 1.25e-05,
"loss": 1.3144,
"step": 4
},
{
"epoch": 0.059076923076923075,
"grad_norm": 14.3125,
"learning_rate": 1.8750000000000002e-05,
"loss": 1.4069,
"step": 6
},
{
"epoch": 0.07876923076923077,
"grad_norm": 11.5625,
"learning_rate": 2.5e-05,
"loss": 1.247,
"step": 8
},
{
"epoch": 0.09846153846153846,
"grad_norm": 11.25,
"learning_rate": 3.125e-05,
"loss": 1.2618,
"step": 10
},
{
"epoch": 0.11815384615384615,
"grad_norm": 10.25,
"learning_rate": 3.7500000000000003e-05,
"loss": 1.287,
"step": 12
},
{
"epoch": 0.13784615384615384,
"grad_norm": 5.9375,
"learning_rate": 4.375e-05,
"loss": 1.1393,
"step": 14
},
{
"epoch": 0.15753846153846154,
"grad_norm": 7.34375,
"learning_rate": 5e-05,
"loss": 1.1422,
"step": 16
},
{
"epoch": 0.17723076923076922,
"grad_norm": 7.84375,
"learning_rate": 4.9997936302412985e-05,
"loss": 1.0547,
"step": 18
},
{
"epoch": 0.19692307692307692,
"grad_norm": 7.0,
"learning_rate": 4.9991745550359746e-05,
"loss": 1.0486,
"step": 20
},
{
"epoch": 0.21661538461538463,
"grad_norm": 7.75,
"learning_rate": 4.99814287659075e-05,
"loss": 1.0206,
"step": 22
},
{
"epoch": 0.2363076923076923,
"grad_norm": 13.0625,
"learning_rate": 4.996698765231409e-05,
"loss": 0.9878,
"step": 24
},
{
"epoch": 0.256,
"grad_norm": 4.59375,
"learning_rate": 4.994842459374682e-05,
"loss": 0.9275,
"step": 26
},
{
"epoch": 0.2756923076923077,
"grad_norm": 6.46875,
"learning_rate": 4.992574265488883e-05,
"loss": 0.9555,
"step": 28
},
{
"epoch": 0.2953846153846154,
"grad_norm": 3.890625,
"learning_rate": 4.989894558043312e-05,
"loss": 0.9275,
"step": 30
},
{
"epoch": 0.3150769230769231,
"grad_norm": 4.75,
"learning_rate": 4.986803779446432e-05,
"loss": 0.9301,
"step": 32
},
{
"epoch": 0.33476923076923076,
"grad_norm": 2.71875,
"learning_rate": 4.983302439972829e-05,
"loss": 0.8875,
"step": 34
},
{
"epoch": 0.35446153846153844,
"grad_norm": 6.65625,
"learning_rate": 4.979391117678969e-05,
"loss": 0.8649,
"step": 36
},
{
"epoch": 0.37415384615384617,
"grad_norm": 2.59375,
"learning_rate": 4.975070458307763e-05,
"loss": 0.8497,
"step": 38
},
{
"epoch": 0.39384615384615385,
"grad_norm": 3.625,
"learning_rate": 4.970341175181956e-05,
"loss": 0.8358,
"step": 40
},
{
"epoch": 0.4135384615384615,
"grad_norm": 4.1875,
"learning_rate": 4.9652040490863624e-05,
"loss": 0.8191,
"step": 42
},
{
"epoch": 0.43323076923076925,
"grad_norm": 4.75,
"learning_rate": 4.95965992813896e-05,
"loss": 0.8425,
"step": 44
},
{
"epoch": 0.45292307692307693,
"grad_norm": 2.796875,
"learning_rate": 4.9537097276508704e-05,
"loss": 0.8027,
"step": 46
},
{
"epoch": 0.4726153846153846,
"grad_norm": 2.453125,
"learning_rate": 4.947354429975245e-05,
"loss": 0.812,
"step": 48
},
{
"epoch": 0.49230769230769234,
"grad_norm": 1.9140625,
"learning_rate": 4.940595084345082e-05,
"loss": 0.7979,
"step": 50
},
{
"epoch": 0.512,
"grad_norm": 2.3125,
"learning_rate": 4.933432806700004e-05,
"loss": 0.7927,
"step": 52
},
{
"epoch": 0.5316923076923077,
"grad_norm": 2.953125,
"learning_rate": 4.925868779502015e-05,
"loss": 0.7773,
"step": 54
},
{
"epoch": 0.5513846153846154,
"grad_norm": 3.625,
"learning_rate": 4.9179042515402926e-05,
"loss": 0.7694,
"step": 56
},
{
"epoch": 0.571076923076923,
"grad_norm": 2.109375,
"learning_rate": 4.909540537725007e-05,
"loss": 0.7703,
"step": 58
},
{
"epoch": 0.5907692307692308,
"grad_norm": 1.890625,
"learning_rate": 4.900779018870239e-05,
"loss": 0.8162,
"step": 60
},
{
"epoch": 0.6104615384615385,
"grad_norm": 2.15625,
"learning_rate": 4.891621141466014e-05,
"loss": 0.743,
"step": 62
},
{
"epoch": 0.6301538461538462,
"grad_norm": 1.6875,
"learning_rate": 4.882068417439493e-05,
"loss": 0.7572,
"step": 64
},
{
"epoch": 0.6498461538461539,
"grad_norm": 2.078125,
"learning_rate": 4.872122423905358e-05,
"loss": 0.7445,
"step": 66
},
{
"epoch": 0.6695384615384615,
"grad_norm": 1.4609375,
"learning_rate": 4.8617848029054354e-05,
"loss": 0.7419,
"step": 68
},
{
"epoch": 0.6892307692307692,
"grad_norm": 1.46875,
"learning_rate": 4.851057261137608e-05,
"loss": 0.7402,
"step": 70
},
{
"epoch": 0.7089230769230769,
"grad_norm": 4.09375,
"learning_rate": 4.839941569674041e-05,
"loss": 0.7131,
"step": 72
},
{
"epoch": 0.7286153846153847,
"grad_norm": 4.25,
"learning_rate": 4.8284395636687854e-05,
"loss": 0.6954,
"step": 74
},
{
"epoch": 0.7483076923076923,
"grad_norm": 3.21875,
"learning_rate": 4.816553142054805e-05,
"loss": 0.699,
"step": 76
},
{
"epoch": 0.768,
"grad_norm": 4.125,
"learning_rate": 4.804284267230468e-05,
"loss": 0.6775,
"step": 78
},
{
"epoch": 0.7876923076923077,
"grad_norm": 2.984375,
"learning_rate": 4.791634964735564e-05,
"loss": 0.7056,
"step": 80
},
{
"epoch": 0.8073846153846154,
"grad_norm": 2.59375,
"learning_rate": 4.778607322916896e-05,
"loss": 0.6944,
"step": 82
},
{
"epoch": 0.827076923076923,
"grad_norm": 1.859375,
"learning_rate": 4.765203492583502e-05,
"loss": 0.668,
"step": 84
},
{
"epoch": 0.8467692307692307,
"grad_norm": 2.109375,
"learning_rate": 4.751425686651568e-05,
"loss": 0.673,
"step": 86
},
{
"epoch": 0.8664615384615385,
"grad_norm": 1.84375,
"learning_rate": 4.737276179779083e-05,
"loss": 0.7153,
"step": 88
},
{
"epoch": 0.8861538461538462,
"grad_norm": 2.125,
"learning_rate": 4.722757307990302e-05,
"loss": 0.7234,
"step": 90
},
{
"epoch": 0.9058461538461539,
"grad_norm": 1.3984375,
"learning_rate": 4.707871468290078e-05,
"loss": 0.6231,
"step": 92
},
{
"epoch": 0.9255384615384615,
"grad_norm": 1.6484375,
"learning_rate": 4.69262111826813e-05,
"loss": 0.642,
"step": 94
},
{
"epoch": 0.9452307692307692,
"grad_norm": 1.5625,
"learning_rate": 4.6770087756932995e-05,
"loss": 0.6231,
"step": 96
},
{
"epoch": 0.9649230769230769,
"grad_norm": 1.78125,
"learning_rate": 4.661037018097884e-05,
"loss": 0.671,
"step": 98
},
{
"epoch": 0.9846153846153847,
"grad_norm": 1.5625,
"learning_rate": 4.6447084823520926e-05,
"loss": 0.6657,
"step": 100
},
{
"epoch": 1.0043076923076923,
"grad_norm": 1.5390625,
"learning_rate": 4.62802586422871e-05,
"loss": 0.607,
"step": 102
},
{
"epoch": 1.024,
"grad_norm": 1.5078125,
"learning_rate": 4.610991917958037e-05,
"loss": 0.5739,
"step": 104
},
{
"epoch": 1.0436923076923077,
"grad_norm": 1.484375,
"learning_rate": 4.593609455773181e-05,
"loss": 0.6011,
"step": 106
},
{
"epoch": 1.0633846153846154,
"grad_norm": 1.5234375,
"learning_rate": 4.5758813474457606e-05,
"loss": 0.5776,
"step": 108
},
{
"epoch": 1.083076923076923,
"grad_norm": 1.234375,
"learning_rate": 4.557810519812128e-05,
"loss": 0.5808,
"step": 110
},
{
"epoch": 1.1027692307692307,
"grad_norm": 1.7890625,
"learning_rate": 4.539399956290152e-05,
"loss": 0.5965,
"step": 112
},
{
"epoch": 1.1224615384615384,
"grad_norm": 1.2109375,
"learning_rate": 4.520652696386677e-05,
"loss": 0.608,
"step": 114
},
{
"epoch": 1.142153846153846,
"grad_norm": 1.625,
"learning_rate": 4.5015718351957015e-05,
"loss": 0.5714,
"step": 116
},
{
"epoch": 1.1618461538461538,
"grad_norm": 1.5703125,
"learning_rate": 4.482160522887403e-05,
"loss": 0.5876,
"step": 118
},
{
"epoch": 1.1815384615384614,
"grad_norm": 1.5078125,
"learning_rate": 4.462421964188052e-05,
"loss": 0.5835,
"step": 120
},
{
"epoch": 1.2012307692307693,
"grad_norm": 1.3515625,
"learning_rate": 4.442359417850924e-05,
"loss": 0.5881,
"step": 122
},
{
"epoch": 1.220923076923077,
"grad_norm": 1.9140625,
"learning_rate": 4.421976196118297e-05,
"loss": 0.5471,
"step": 124
},
{
"epoch": 1.2406153846153847,
"grad_norm": 1.625,
"learning_rate": 4.401275664174611e-05,
"loss": 0.5417,
"step": 126
},
{
"epoch": 1.2603076923076924,
"grad_norm": 1.625,
"learning_rate": 4.380261239590892e-05,
"loss": 0.5337,
"step": 128
},
{
"epoch": 1.28,
"grad_norm": 1.5625,
"learning_rate": 4.358936391760524e-05,
"loss": 0.5731,
"step": 130
},
{
"epoch": 1.2996923076923077,
"grad_norm": 1.5625,
"learning_rate": 4.337304641326467e-05,
"loss": 0.5363,
"step": 132
},
{
"epoch": 1.3193846153846154,
"grad_norm": 1.515625,
"learning_rate": 4.315369559600018e-05,
"loss": 0.5566,
"step": 134
},
{
"epoch": 1.339076923076923,
"grad_norm": 1.3359375,
"learning_rate": 4.2931347679711924e-05,
"loss": 0.586,
"step": 136
},
{
"epoch": 1.3587692307692307,
"grad_norm": 1.6015625,
"learning_rate": 4.270603937310859e-05,
"loss": 0.5535,
"step": 138
},
{
"epoch": 1.3784615384615384,
"grad_norm": 1.2734375,
"learning_rate": 4.2477807873646845e-05,
"loss": 0.5788,
"step": 140
},
{
"epoch": 1.398153846153846,
"grad_norm": 1.484375,
"learning_rate": 4.2246690861390294e-05,
"loss": 0.538,
"step": 142
},
{
"epoch": 1.417846153846154,
"grad_norm": 1.453125,
"learning_rate": 4.201272649278856e-05,
"loss": 0.5531,
"step": 144
},
{
"epoch": 1.4375384615384617,
"grad_norm": 1.5,
"learning_rate": 4.177595339437789e-05,
"loss": 0.55,
"step": 146
},
{
"epoch": 1.4572307692307693,
"grad_norm": 1.2890625,
"learning_rate": 4.153641065640402e-05,
"loss": 0.5333,
"step": 148
},
{
"epoch": 1.476923076923077,
"grad_norm": 1.4296875,
"learning_rate": 4.129413782636859e-05,
"loss": 0.5372,
"step": 150
},
{
"epoch": 1.4966153846153847,
"grad_norm": 1.359375,
"learning_rate": 4.1049174902499974e-05,
"loss": 0.5575,
"step": 152
},
{
"epoch": 1.5163076923076924,
"grad_norm": 1.4296875,
"learning_rate": 4.080156232714976e-05,
"loss": 0.5571,
"step": 154
},
{
"epoch": 1.536,
"grad_norm": 1.2421875,
"learning_rate": 4.055134098011589e-05,
"loss": 0.5246,
"step": 156
},
{
"epoch": 1.5556923076923077,
"grad_norm": 1.5546875,
"learning_rate": 4.0298552171893576e-05,
"loss": 0.5597,
"step": 158
},
{
"epoch": 1.5753846153846154,
"grad_norm": 1.3046875,
"learning_rate": 4.0043237636855116e-05,
"loss": 0.5536,
"step": 160
},
{
"epoch": 1.595076923076923,
"grad_norm": 1.5078125,
"learning_rate": 3.978543952635967e-05,
"loss": 0.5527,
"step": 162
},
{
"epoch": 1.6147692307692307,
"grad_norm": 1.3359375,
"learning_rate": 3.952520040179434e-05,
"loss": 0.5137,
"step": 164
},
{
"epoch": 1.6344615384615384,
"grad_norm": 1.34375,
"learning_rate": 3.92625632275474e-05,
"loss": 0.5795,
"step": 166
},
{
"epoch": 1.654153846153846,
"grad_norm": 1.46875,
"learning_rate": 3.899757136391507e-05,
"loss": 0.5237,
"step": 168
},
{
"epoch": 1.6738461538461538,
"grad_norm": 1.28125,
"learning_rate": 3.873026855994292e-05,
"loss": 0.5326,
"step": 170
},
{
"epoch": 1.6935384615384614,
"grad_norm": 1.3125,
"learning_rate": 3.8460698946203054e-05,
"loss": 0.5231,
"step": 172
},
{
"epoch": 1.7132307692307691,
"grad_norm": 1.5546875,
"learning_rate": 3.818890702750841e-05,
"loss": 0.5492,
"step": 174
},
{
"epoch": 1.7329230769230768,
"grad_norm": 1.453125,
"learning_rate": 3.791493767556511e-05,
"loss": 0.6126,
"step": 176
},
{
"epoch": 1.7526153846153845,
"grad_norm": 1.1953125,
"learning_rate": 3.7638836121564415e-05,
"loss": 0.5463,
"step": 178
},
{
"epoch": 1.7723076923076924,
"grad_norm": 1.3515625,
"learning_rate": 3.7360647948715164e-05,
"loss": 0.515,
"step": 180
},
{
"epoch": 1.792,
"grad_norm": 1.296875,
"learning_rate": 3.708041908471827e-05,
"loss": 0.5259,
"step": 182
},
{
"epoch": 1.8116923076923077,
"grad_norm": 1.578125,
"learning_rate": 3.679819579418414e-05,
"loss": 0.5059,
"step": 184
},
{
"epoch": 1.8313846153846154,
"grad_norm": 1.375,
"learning_rate": 3.651402467099468e-05,
"loss": 0.5709,
"step": 186
},
{
"epoch": 1.851076923076923,
"grad_norm": 1.21875,
"learning_rate": 3.622795263061079e-05,
"loss": 0.5628,
"step": 188
},
{
"epoch": 1.8707692307692307,
"grad_norm": 1.4609375,
"learning_rate": 3.594002690232682e-05,
"loss": 0.5066,
"step": 190
},
{
"epoch": 1.8904615384615384,
"grad_norm": 1.3203125,
"learning_rate": 3.565029502147323e-05,
"loss": 0.5625,
"step": 192
},
{
"epoch": 1.9101538461538463,
"grad_norm": 1.6796875,
"learning_rate": 3.53588048215687e-05,
"loss": 0.5336,
"step": 194
},
{
"epoch": 1.929846153846154,
"grad_norm": 1.4921875,
"learning_rate": 3.506560442642299e-05,
"loss": 0.5215,
"step": 196
},
{
"epoch": 1.9495384615384617,
"grad_norm": 1.546875,
"learning_rate": 3.4770742242191945e-05,
"loss": 0.5296,
"step": 198
},
{
"epoch": 1.9692307692307693,
"grad_norm": 2.421875,
"learning_rate": 3.4474266949385817e-05,
"loss": 0.523,
"step": 200
},
{
"epoch": 1.988923076923077,
"grad_norm": 1.34375,
"learning_rate": 3.4176227494832305e-05,
"loss": 0.4856,
"step": 202
},
{
"epoch": 2.0086153846153847,
"grad_norm": 1.3984375,
"learning_rate": 3.387667308359568e-05,
"loss": 0.5298,
"step": 204
},
{
"epoch": 2.0283076923076924,
"grad_norm": 1.375,
"learning_rate": 3.3575653170853175e-05,
"loss": 0.4869,
"step": 206
},
{
"epoch": 2.048,
"grad_norm": 1.0703125,
"learning_rate": 3.327321745373021e-05,
"loss": 0.479,
"step": 208
},
{
"epoch": 2.0676923076923077,
"grad_norm": 1.234375,
"learning_rate": 3.2969415863095556e-05,
"loss": 0.4935,
"step": 210
},
{
"epoch": 2.0873846153846154,
"grad_norm": 1.2734375,
"learning_rate": 3.266429855531797e-05,
"loss": 0.4773,
"step": 212
},
{
"epoch": 2.107076923076923,
"grad_norm": 1.3515625,
"learning_rate": 3.2357915903985605e-05,
"loss": 0.4611,
"step": 214
},
{
"epoch": 2.1267692307692307,
"grad_norm": 1.2265625,
"learning_rate": 3.2050318491589506e-05,
"loss": 0.469,
"step": 216
},
{
"epoch": 2.1464615384615384,
"grad_norm": 1.2109375,
"learning_rate": 3.174155710117271e-05,
"loss": 0.4758,
"step": 218
},
{
"epoch": 2.166153846153846,
"grad_norm": 1.34375,
"learning_rate": 3.143168270794612e-05,
"loss": 0.4933,
"step": 220
},
{
"epoch": 2.1858461538461538,
"grad_norm": 1.2109375,
"learning_rate": 3.112074647087274e-05,
"loss": 0.4814,
"step": 222
},
{
"epoch": 2.2055384615384614,
"grad_norm": 1.203125,
"learning_rate": 3.080879972422154e-05,
"loss": 0.5064,
"step": 224
},
{
"epoch": 2.225230769230769,
"grad_norm": 1.34375,
"learning_rate": 3.0495893969092392e-05,
"loss": 0.4576,
"step": 226
},
{
"epoch": 2.244923076923077,
"grad_norm": 1.4453125,
"learning_rate": 3.0182080864913452e-05,
"loss": 0.4902,
"step": 228
},
{
"epoch": 2.2646153846153845,
"grad_norm": 1.21875,
"learning_rate": 2.9867412220912373e-05,
"loss": 0.4486,
"step": 230
},
{
"epoch": 2.284307692307692,
"grad_norm": 1.1953125,
"learning_rate": 2.9551939987562866e-05,
"loss": 0.4786,
"step": 232
},
{
"epoch": 2.304,
"grad_norm": 1.2578125,
"learning_rate": 2.923571624800787e-05,
"loss": 0.4814,
"step": 234
},
{
"epoch": 2.3236923076923075,
"grad_norm": 1.3359375,
"learning_rate": 2.891879320946086e-05,
"loss": 0.4915,
"step": 236
},
{
"epoch": 2.3433846153846156,
"grad_norm": 1.2265625,
"learning_rate": 2.8601223194586612e-05,
"loss": 0.4931,
"step": 238
},
{
"epoch": 2.363076923076923,
"grad_norm": 1.4453125,
"learning_rate": 2.8283058632863003e-05,
"loss": 0.481,
"step": 240
},
{
"epoch": 2.382769230769231,
"grad_norm": 1.4140625,
"learning_rate": 2.7964352051925103e-05,
"loss": 0.4458,
"step": 242
},
{
"epoch": 2.4024615384615386,
"grad_norm": 1.28125,
"learning_rate": 2.7645156068893073e-05,
"loss": 0.499,
"step": 244
},
{
"epoch": 2.4221538461538463,
"grad_norm": 2.078125,
"learning_rate": 2.732552338168531e-05,
"loss": 0.4937,
"step": 246
},
{
"epoch": 2.441846153846154,
"grad_norm": 1.1875,
"learning_rate": 2.7005506760318235e-05,
"loss": 0.4628,
"step": 248
},
{
"epoch": 2.4615384615384617,
"grad_norm": 1.421875,
"learning_rate": 2.66851590381942e-05,
"loss": 0.4741,
"step": 250
},
{
"epoch": 2.4812307692307694,
"grad_norm": 1.21875,
"learning_rate": 2.6364533103378896e-05,
"loss": 0.4569,
"step": 252
},
{
"epoch": 2.500923076923077,
"grad_norm": 1.3515625,
"learning_rate": 2.604368188986977e-05,
"loss": 0.4851,
"step": 254
},
{
"epoch": 2.5206153846153847,
"grad_norm": 1.4921875,
"learning_rate": 2.5722658368856816e-05,
"loss": 0.4935,
"step": 256
},
{
"epoch": 2.5403076923076924,
"grad_norm": 1.203125,
"learning_rate": 2.5401515539977305e-05,
"loss": 0.4947,
"step": 258
},
{
"epoch": 2.56,
"grad_norm": 1.328125,
"learning_rate": 2.5080306422565707e-05,
"loss": 0.4642,
"step": 260
},
{
"epoch": 2.5796923076923077,
"grad_norm": 1.4140625,
"learning_rate": 2.4759084046900486e-05,
"loss": 0.5064,
"step": 262
},
{
"epoch": 2.5993846153846154,
"grad_norm": 1.2734375,
"learning_rate": 2.4437901445448936e-05,
"loss": 0.4376,
"step": 264
},
{
"epoch": 2.619076923076923,
"grad_norm": 1.2734375,
"learning_rate": 2.4116811644111852e-05,
"loss": 0.4861,
"step": 266
},
{
"epoch": 2.6387692307692308,
"grad_norm": 1.3203125,
"learning_rate": 2.379586765346907e-05,
"loss": 0.4878,
"step": 268
},
{
"epoch": 2.6584615384615384,
"grad_norm": 1.3125,
"learning_rate": 2.347512246002774e-05,
"loss": 0.4827,
"step": 270
},
{
"epoch": 2.678153846153846,
"grad_norm": 1.3359375,
"learning_rate": 2.3154629017474384e-05,
"loss": 0.4769,
"step": 272
},
{
"epoch": 2.697846153846154,
"grad_norm": 1.3203125,
"learning_rate": 2.2834440237932536e-05,
"loss": 0.5063,
"step": 274
},
{
"epoch": 2.7175384615384615,
"grad_norm": 1.15625,
"learning_rate": 2.251460898322712e-05,
"loss": 0.4483,
"step": 276
},
{
"epoch": 2.737230769230769,
"grad_norm": 1.3984375,
"learning_rate": 2.219518805615724e-05,
"loss": 0.4855,
"step": 278
},
{
"epoch": 2.756923076923077,
"grad_norm": 1.296875,
"learning_rate": 2.1876230191778598e-05,
"loss": 0.4663,
"step": 280
},
{
"epoch": 2.7766153846153845,
"grad_norm": 1.109375,
"learning_rate": 2.155778804869721e-05,
"loss": 0.5065,
"step": 282
},
{
"epoch": 2.796307692307692,
"grad_norm": 1.25,
"learning_rate": 2.123991420037565e-05,
"loss": 0.4757,
"step": 284
},
{
"epoch": 2.816,
"grad_norm": 1.359375,
"learning_rate": 2.0922661126453432e-05,
"loss": 0.4768,
"step": 286
},
{
"epoch": 2.835692307692308,
"grad_norm": 1.265625,
"learning_rate": 2.0606081204082797e-05,
"loss": 0.4383,
"step": 288
},
{
"epoch": 2.855384615384615,
"grad_norm": 1.5625,
"learning_rate": 2.02902266992815e-05,
"loss": 0.4976,
"step": 290
},
{
"epoch": 2.8750769230769233,
"grad_norm": 1.078125,
"learning_rate": 1.9975149758303883e-05,
"loss": 0.4871,
"step": 292
},
{
"epoch": 2.8947692307692305,
"grad_norm": 1.3125,
"learning_rate": 1.9660902399031782e-05,
"loss": 0.4807,
"step": 294
},
{
"epoch": 2.9144615384615387,
"grad_norm": 1.4765625,
"learning_rate": 1.9347536502386553e-05,
"loss": 0.4544,
"step": 296
},
{
"epoch": 2.934153846153846,
"grad_norm": 1.2109375,
"learning_rate": 1.9035103803763792e-05,
"loss": 0.4924,
"step": 298
},
{
"epoch": 2.953846153846154,
"grad_norm": 1.3828125,
"learning_rate": 1.8723655884491982e-05,
"loss": 0.4846,
"step": 300
},
{
"epoch": 2.9735384615384617,
"grad_norm": 1.21875,
"learning_rate": 1.8413244163316696e-05,
"loss": 0.4921,
"step": 302
},
{
"epoch": 2.9932307692307694,
"grad_norm": 1.3359375,
"learning_rate": 1.8103919887911526e-05,
"loss": 0.4728,
"step": 304
},
{
"epoch": 3.012923076923077,
"grad_norm": 1.109375,
"learning_rate": 1.7795734126417326e-05,
"loss": 0.4531,
"step": 306
},
{
"epoch": 3.0326153846153847,
"grad_norm": 1.328125,
"learning_rate": 1.7488737759011105e-05,
"loss": 0.4468,
"step": 308
},
{
"epoch": 3.0523076923076924,
"grad_norm": 1.234375,
"learning_rate": 1.718298146950585e-05,
"loss": 0.4727,
"step": 310
},
{
"epoch": 3.072,
"grad_norm": 1.09375,
"learning_rate": 1.6878515736982915e-05,
"loss": 0.4429,
"step": 312
},
{
"epoch": 3.0916923076923077,
"grad_norm": 1.1484375,
"learning_rate": 1.657539082745811e-05,
"loss": 0.4304,
"step": 314
},
{
"epoch": 3.1113846153846154,
"grad_norm": 1.125,
"learning_rate": 1.6273656785582986e-05,
"loss": 0.4814,
"step": 316
},
{
"epoch": 3.131076923076923,
"grad_norm": 1.0625,
"learning_rate": 1.597336342638266e-05,
"loss": 0.411,
"step": 318
},
{
"epoch": 3.1507692307692308,
"grad_norm": 1.40625,
"learning_rate": 1.5674560327031613e-05,
"loss": 0.4318,
"step": 320
},
{
"epoch": 3.1704615384615384,
"grad_norm": 1.5703125,
"learning_rate": 1.5377296818668638e-05,
"loss": 0.4685,
"step": 322
},
{
"epoch": 3.190153846153846,
"grad_norm": 1.109375,
"learning_rate": 1.5081621978252548e-05,
"loss": 0.423,
"step": 324
},
{
"epoch": 3.209846153846154,
"grad_norm": 1.2734375,
"learning_rate": 1.47875846204597e-05,
"loss": 0.4587,
"step": 326
},
{
"epoch": 3.2295384615384615,
"grad_norm": 1.078125,
"learning_rate": 1.449523328962496e-05,
"loss": 0.4341,
"step": 328
},
{
"epoch": 3.249230769230769,
"grad_norm": 1.140625,
"learning_rate": 1.420461625172721e-05,
"loss": 0.4596,
"step": 330
},
{
"epoch": 3.268923076923077,
"grad_norm": 1.3828125,
"learning_rate": 1.3915781486420848e-05,
"loss": 0.4357,
"step": 332
},
{
"epoch": 3.2886153846153845,
"grad_norm": 1.2421875,
"learning_rate": 1.3628776679114517e-05,
"loss": 0.4672,
"step": 334
},
{
"epoch": 3.308307692307692,
"grad_norm": 1.34375,
"learning_rate": 1.3343649213098486e-05,
"loss": 0.4494,
"step": 336
},
{
"epoch": 3.328,
"grad_norm": 1.296875,
"learning_rate": 1.3060446161721855e-05,
"loss": 0.4619,
"step": 338
},
{
"epoch": 3.3476923076923075,
"grad_norm": 1.171875,
"learning_rate": 1.277921428062091e-05,
"loss": 0.4561,
"step": 340
},
{
"epoch": 3.367384615384615,
"grad_norm": 1.1484375,
"learning_rate": 1.2500000000000006e-05,
"loss": 0.4275,
"step": 342
},
{
"epoch": 3.387076923076923,
"grad_norm": 1.390625,
"learning_rate": 1.2222849416966117e-05,
"loss": 0.4704,
"step": 344
},
{
"epoch": 3.406769230769231,
"grad_norm": 1.296875,
"learning_rate": 1.1947808287918404e-05,
"loss": 0.4283,
"step": 346
},
{
"epoch": 3.4264615384615382,
"grad_norm": 1.1953125,
"learning_rate": 1.1674922020994022e-05,
"loss": 0.4346,
"step": 348
},
{
"epoch": 3.4461538461538463,
"grad_norm": 1.203125,
"learning_rate": 1.14042356685714e-05,
"loss": 0.4613,
"step": 350
},
{
"epoch": 3.465846153846154,
"grad_norm": 1.3046875,
"learning_rate": 1.1135793919832336e-05,
"loss": 0.4634,
"step": 352
},
{
"epoch": 3.4855384615384617,
"grad_norm": 1.3828125,
"learning_rate": 1.0869641093383962e-05,
"loss": 0.4702,
"step": 354
},
{
"epoch": 3.5052307692307694,
"grad_norm": 1.2578125,
"learning_rate": 1.0605821129941934e-05,
"loss": 0.458,
"step": 356
},
{
"epoch": 3.524923076923077,
"grad_norm": 1.6796875,
"learning_rate": 1.0344377585075998e-05,
"loss": 0.4286,
"step": 358
},
{
"epoch": 3.5446153846153847,
"grad_norm": 1.3125,
"learning_rate": 1.0085353622019175e-05,
"loss": 0.46,
"step": 360
},
{
"epoch": 3.5643076923076924,
"grad_norm": 1.25,
"learning_rate": 9.82879200454167e-06,
"loss": 0.4323,
"step": 362
},
{
"epoch": 3.584,
"grad_norm": 1.3125,
"learning_rate": 9.574735089890766e-06,
"loss": 0.4452,
"step": 364
},
{
"epoch": 3.6036923076923078,
"grad_norm": 1.2734375,
"learning_rate": 9.323224821797782e-06,
"loss": 0.4605,
"step": 366
},
{
"epoch": 3.6233846153846154,
"grad_norm": 1.09375,
"learning_rate": 9.074302723553398e-06,
"loss": 0.4871,
"step": 368
},
{
"epoch": 3.643076923076923,
"grad_norm": 1.21875,
"learning_rate": 8.8280098911523e-06,
"loss": 0.4801,
"step": 370
},
{
"epoch": 3.6627692307692308,
"grad_norm": 1.15625,
"learning_rate": 8.584386986508388e-06,
"loss": 0.4666,
"step": 372
},
{
"epoch": 3.6824615384615385,
"grad_norm": 1.4296875,
"learning_rate": 8.343474230741715e-06,
"loss": 0.4404,
"step": 374
},
{
"epoch": 3.702153846153846,
"grad_norm": 1.1484375,
"learning_rate": 8.105311397538085e-06,
"loss": 0.4526,
"step": 376
},
{
"epoch": 3.721846153846154,
"grad_norm": 1.4296875,
"learning_rate": 7.869937806582642e-06,
"loss": 0.4433,
"step": 378
},
{
"epoch": 3.7415384615384615,
"grad_norm": 1.1484375,
"learning_rate": 7.63739231706833e-06,
"loss": 0.4287,
"step": 380
},
{
"epoch": 3.761230769230769,
"grad_norm": 1.40625,
"learning_rate": 7.407713321280377e-06,
"loss": 0.465,
"step": 382
},
{
"epoch": 3.780923076923077,
"grad_norm": 1.203125,
"learning_rate": 7.180938738257944e-06,
"loss": 0.445,
"step": 384
},
{
"epoch": 3.8006153846153845,
"grad_norm": 1.3203125,
"learning_rate": 6.957106007533826e-06,
"loss": 0.4544,
"step": 386
},
{
"epoch": 3.820307692307692,
"grad_norm": 1.15625,
"learning_rate": 6.736252082953307e-06,
"loss": 0.4508,
"step": 388
},
{
"epoch": 3.84,
"grad_norm": 1.3671875,
"learning_rate": 6.5184134265733e-06,
"loss": 0.4575,
"step": 390
},
{
"epoch": 3.8596923076923075,
"grad_norm": 1.28125,
"learning_rate": 6.303626002642554e-06,
"loss": 0.4432,
"step": 392
},
{
"epoch": 3.879384615384615,
"grad_norm": 1.34375,
"learning_rate": 6.091925271664156e-06,
"loss": 0.4614,
"step": 394
},
{
"epoch": 3.8990769230769233,
"grad_norm": 1.0625,
"learning_rate": 5.883346184541128e-06,
"loss": 0.4645,
"step": 396
},
{
"epoch": 3.9187692307692306,
"grad_norm": 1.234375,
"learning_rate": 5.67792317680616e-06,
"loss": 0.4533,
"step": 398
},
{
"epoch": 3.9384615384615387,
"grad_norm": 1.2421875,
"learning_rate": 5.475690162936489e-06,
"loss": 0.4232,
"step": 400
},
{
"epoch": 3.958153846153846,
"grad_norm": 2.046875,
"learning_rate": 5.27668053075474e-06,
"loss": 0.4266,
"step": 402
},
{
"epoch": 3.977846153846154,
"grad_norm": 1.2890625,
"learning_rate": 5.0809271359167215e-06,
"loss": 0.4529,
"step": 404
},
{
"epoch": 3.9975384615384613,
"grad_norm": 1.40625,
"learning_rate": 4.888462296487128e-06,
"loss": 0.4429,
"step": 406
},
{
"epoch": 4.017230769230769,
"grad_norm": 1.109375,
"learning_rate": 4.699317787603927e-06,
"loss": 0.4537,
"step": 408
},
{
"epoch": 4.036923076923077,
"grad_norm": 1.3046875,
"learning_rate": 4.513524836232458e-06,
"loss": 0.4659,
"step": 410
},
{
"epoch": 4.056615384615385,
"grad_norm": 1.09375,
"learning_rate": 4.331114116009938e-06,
"loss": 0.4156,
"step": 412
},
{
"epoch": 4.076307692307692,
"grad_norm": 1.3125,
"learning_rate": 4.152115742181434e-06,
"loss": 0.4561,
"step": 414
},
{
"epoch": 4.096,
"grad_norm": 1.5390625,
"learning_rate": 3.97655926662791e-06,
"loss": 0.4438,
"step": 416
},
{
"epoch": 4.115692307692307,
"grad_norm": 1.234375,
"learning_rate": 3.80447367298738e-06,
"loss": 0.4331,
"step": 418
},
{
"epoch": 4.135384615384615,
"grad_norm": 1.1328125,
"learning_rate": 3.6358873718697726e-06,
"loss": 0.4261,
"step": 420
},
{
"epoch": 4.155076923076923,
"grad_norm": 1.2890625,
"learning_rate": 3.470828196166523e-06,
"loss": 0.4629,
"step": 422
},
{
"epoch": 4.174769230769231,
"grad_norm": 1.1875,
"learning_rate": 3.3093233964554466e-06,
"loss": 0.4271,
"step": 424
},
{
"epoch": 4.194461538461539,
"grad_norm": 1.2578125,
"learning_rate": 3.151399636501773e-06,
"loss": 0.4229,
"step": 426
},
{
"epoch": 4.214153846153846,
"grad_norm": 1.28125,
"learning_rate": 2.997082988856087e-06,
"loss": 0.4504,
"step": 428
},
{
"epoch": 4.233846153846154,
"grad_norm": 1.4921875,
"learning_rate": 2.8463989305498596e-06,
"loss": 0.428,
"step": 430
},
{
"epoch": 4.2535384615384615,
"grad_norm": 1.3125,
"learning_rate": 2.699372338889297e-06,
"loss": 0.4399,
"step": 432
},
{
"epoch": 4.27323076923077,
"grad_norm": 1.2890625,
"learning_rate": 2.5560274873481975e-06,
"loss": 0.4375,
"step": 434
},
{
"epoch": 4.292923076923077,
"grad_norm": 1.1875,
"learning_rate": 2.416388041560491e-06,
"loss": 0.4231,
"step": 436
},
{
"epoch": 4.312615384615385,
"grad_norm": 1.2578125,
"learning_rate": 2.2804770554131686e-06,
"loss": 0.4409,
"step": 438
},
{
"epoch": 4.332307692307692,
"grad_norm": 1.25,
"learning_rate": 2.1483169672401686e-06,
"loss": 0.4693,
"step": 440
},
{
"epoch": 4.352,
"grad_norm": 1.234375,
"learning_rate": 2.0199295961178893e-06,
"loss": 0.4454,
"step": 442
},
{
"epoch": 4.3716923076923075,
"grad_norm": 1.40625,
"learning_rate": 1.895336138262968e-06,
"loss": 0.4543,
"step": 444
},
{
"epoch": 4.391384615384616,
"grad_norm": 1.1015625,
"learning_rate": 1.7745571635328723e-06,
"loss": 0.4302,
"step": 446
},
{
"epoch": 4.411076923076923,
"grad_norm": 1.125,
"learning_rate": 1.6576126120299045e-06,
"loss": 0.4325,
"step": 448
},
{
"epoch": 4.430769230769231,
"grad_norm": 1.2734375,
"learning_rate": 1.5445217908091613e-06,
"loss": 0.4406,
"step": 450
},
{
"epoch": 4.450461538461538,
"grad_norm": 1.21875,
"learning_rate": 1.4353033706910296e-06,
"loss": 0.4631,
"step": 452
},
{
"epoch": 4.470153846153846,
"grad_norm": 1.15625,
"learning_rate": 1.3299753831787192e-06,
"loss": 0.4466,
"step": 454
},
{
"epoch": 4.489846153846154,
"grad_norm": 1.4140625,
"learning_rate": 1.2285552174813225e-06,
"loss": 0.4379,
"step": 456
},
{
"epoch": 4.509538461538462,
"grad_norm": 1.171875,
"learning_rate": 1.131059617642935e-06,
"loss": 0.443,
"step": 458
},
{
"epoch": 4.529230769230769,
"grad_norm": 1.1796875,
"learning_rate": 1.0375046797782866e-06,
"loss": 0.4793,
"step": 460
},
{
"epoch": 4.548923076923077,
"grad_norm": 1.3046875,
"learning_rate": 9.479058494153425e-07,
"loss": 0.4512,
"step": 462
},
{
"epoch": 4.568615384615384,
"grad_norm": 1.1328125,
"learning_rate": 8.622779189453007e-07,
"loss": 0.4558,
"step": 464
},
{
"epoch": 4.588307692307692,
"grad_norm": 1.3125,
"learning_rate": 7.806350251804484e-07,
"loss": 0.4365,
"step": 466
},
{
"epoch": 4.608,
"grad_norm": 1.3359375,
"learning_rate": 7.029906470202046e-07,
"loss": 0.4499,
"step": 468
},
{
"epoch": 4.627692307692308,
"grad_norm": 1.421875,
"learning_rate": 6.293576032258413e-07,
"loss": 0.4228,
"step": 470
},
{
"epoch": 4.647384615384615,
"grad_norm": 1.3828125,
"learning_rate": 5.597480503041486e-07,
"loss": 0.4443,
"step": 472
},
{
"epoch": 4.667076923076923,
"grad_norm": 1.375,
"learning_rate": 4.941734805004289e-07,
"loss": 0.4462,
"step": 474
},
{
"epoch": 4.686769230769231,
"grad_norm": 1.140625,
"learning_rate": 4.326447199012068e-07,
"loss": 0.4136,
"step": 476
},
{
"epoch": 4.7064615384615385,
"grad_norm": 1.078125,
"learning_rate": 3.751719266468584e-07,
"loss": 0.418,
"step": 478
},
{
"epoch": 4.726153846153846,
"grad_norm": 1.28125,
"learning_rate": 3.217645892545695e-07,
"loss": 0.437,
"step": 480
},
{
"epoch": 4.745846153846154,
"grad_norm": 1.234375,
"learning_rate": 2.724315250518056e-07,
"loss": 0.4599,
"step": 482
},
{
"epoch": 4.765538461538462,
"grad_norm": 1.2578125,
"learning_rate": 2.271808787206092e-07,
"loss": 0.4741,
"step": 484
},
{
"epoch": 4.785230769230769,
"grad_norm": 1.0625,
"learning_rate": 1.860201209529483e-07,
"loss": 0.454,
"step": 486
},
{
"epoch": 4.804923076923077,
"grad_norm": 1.2421875,
"learning_rate": 1.489560472173468e-07,
"loss": 0.4625,
"step": 488
},
{
"epoch": 4.8246153846153845,
"grad_norm": 1.5390625,
"learning_rate": 1.1599477663696845e-07,
"loss": 0.443,
"step": 490
},
{
"epoch": 4.844307692307693,
"grad_norm": 1.109375,
"learning_rate": 8.714175097937204e-08,
"loss": 0.4617,
"step": 492
},
{
"epoch": 4.864,
"grad_norm": 1.2578125,
"learning_rate": 6.240173375811343e-08,
"loss": 0.4432,
"step": 494
},
{
"epoch": 4.883692307692308,
"grad_norm": 1.3125,
"learning_rate": 4.1778809446302304e-08,
"loss": 0.4661,
"step": 496
},
{
"epoch": 4.903384615384615,
"grad_norm": 1.1953125,
"learning_rate": 2.5276382802272292e-08,
"loss": 0.4307,
"step": 498
},
{
"epoch": 4.923076923076923,
"grad_norm": 1.2890625,
"learning_rate": 1.2897178307461067e-08,
"loss": 0.4554,
"step": 500
},
{
"epoch": 4.942769230769231,
"grad_norm": 1.140625,
"learning_rate": 4.6432397166285e-09,
"loss": 0.4637,
"step": 502
},
{
"epoch": 4.962461538461539,
"grad_norm": 1.3515625,
"learning_rate": 5.159297204238023e-10,
"loss": 0.459,
"step": 504
}
],
"logging_steps": 2,
"max_steps": 505,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.4919954461790044e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}