tinybert-javanese / trainer_state.json
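Note: the log_history entries below can be read back with a few lines of Python. This is a minimal sketch, not part of the checkpoint itself; it assumes the file has been downloaded locally as trainer_state.json and that matplotlib is installed. The field names (step, loss) are taken from the entries as logged.

import json
import matplotlib.pyplot as plt

# Load the Trainer state dumped by the Hugging Face Trainer.
with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only the training-log entries that carry a loss value.
steps = [e["step"] for e in state["log_history"] if "loss" in e]
losses = [e["loss"] for e in state["log_history"] if "loss" in e]

# Plot the training-loss curve over optimizer steps.
plt.plot(steps, losses)
plt.xlabel("step")
plt.ylabel("training loss")
plt.title("tinybert-javanese training loss")
plt.show()
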
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 30.0,
"eval_steps": 500,
"global_step": 71610,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.04189359028068706,
"grad_norm": 1.1499052047729492,
"learning_rate": 4.993017734953219e-05,
"loss": 9.973,
"step": 100
},
{
"epoch": 0.08378718056137412,
"grad_norm": 1.1211209297180176,
"learning_rate": 4.986035469906438e-05,
"loss": 9.315,
"step": 200
},
{
"epoch": 0.12568077084206117,
"grad_norm": 1.0281990766525269,
"learning_rate": 4.979053204859657e-05,
"loss": 8.8239,
"step": 300
},
{
"epoch": 0.16757436112274823,
"grad_norm": 1.1534616947174072,
"learning_rate": 4.972070939812876e-05,
"loss": 8.4254,
"step": 400
},
{
"epoch": 0.20946795140343527,
"grad_norm": 0.9906216859817505,
"learning_rate": 4.9650886747660944e-05,
"loss": 8.1225,
"step": 500
},
{
"epoch": 0.25136154168412234,
"grad_norm": 0.9559820294380188,
"learning_rate": 4.958106409719313e-05,
"loss": 7.8557,
"step": 600
},
{
"epoch": 0.2932551319648094,
"grad_norm": 0.7887147665023804,
"learning_rate": 4.951124144672532e-05,
"loss": 7.7282,
"step": 700
},
{
"epoch": 0.33514872224549647,
"grad_norm": 0.6530473232269287,
"learning_rate": 4.944141879625751e-05,
"loss": 7.6586,
"step": 800
},
{
"epoch": 0.3770423125261835,
"grad_norm": 0.6546779870986938,
"learning_rate": 4.9371596145789694e-05,
"loss": 7.6438,
"step": 900
},
{
"epoch": 0.41893590280687054,
"grad_norm": 0.5660322308540344,
"learning_rate": 4.9301773495321885e-05,
"loss": 7.6237,
"step": 1000
},
{
"epoch": 0.4608294930875576,
"grad_norm": 0.6094719171524048,
"learning_rate": 4.9231950844854076e-05,
"loss": 7.5937,
"step": 1100
},
{
"epoch": 0.5027230833682447,
"grad_norm": 0.6101346015930176,
"learning_rate": 4.916212819438626e-05,
"loss": 7.5661,
"step": 1200
},
{
"epoch": 0.5446166736489317,
"grad_norm": 0.5943477749824524,
"learning_rate": 4.909230554391845e-05,
"loss": 7.6007,
"step": 1300
},
{
"epoch": 0.5865102639296188,
"grad_norm": 0.6604776382446289,
"learning_rate": 4.902248289345064e-05,
"loss": 7.568,
"step": 1400
},
{
"epoch": 0.6284038542103059,
"grad_norm": 0.6151777505874634,
"learning_rate": 4.8952660242982826e-05,
"loss": 7.5521,
"step": 1500
},
{
"epoch": 0.6702974444909929,
"grad_norm": 0.6381381750106812,
"learning_rate": 4.888283759251501e-05,
"loss": 7.5478,
"step": 1600
},
{
"epoch": 0.7121910347716799,
"grad_norm": 0.6552081108093262,
"learning_rate": 4.88130149420472e-05,
"loss": 7.5293,
"step": 1700
},
{
"epoch": 0.754084625052367,
"grad_norm": 0.6973659992218018,
"learning_rate": 4.874319229157939e-05,
"loss": 7.4983,
"step": 1800
},
{
"epoch": 0.795978215333054,
"grad_norm": 0.8130584955215454,
"learning_rate": 4.8673369641111576e-05,
"loss": 7.5213,
"step": 1900
},
{
"epoch": 0.8378718056137411,
"grad_norm": 0.8530446887016296,
"learning_rate": 4.860354699064377e-05,
"loss": 7.5105,
"step": 2000
},
{
"epoch": 0.8797653958944281,
"grad_norm": 0.8059477210044861,
"learning_rate": 4.853372434017596e-05,
"loss": 7.508,
"step": 2100
},
{
"epoch": 0.9216589861751152,
"grad_norm": 0.7378331422805786,
"learning_rate": 4.846390168970814e-05,
"loss": 7.482,
"step": 2200
},
{
"epoch": 0.9635525764558023,
"grad_norm": 1.1823194026947021,
"learning_rate": 4.839407903924033e-05,
"loss": 7.4778,
"step": 2300
},
{
"epoch": 1.0054461667364893,
"grad_norm": 0.8547298908233643,
"learning_rate": 4.832425638877252e-05,
"loss": 7.4529,
"step": 2400
},
{
"epoch": 1.0473397570171763,
"grad_norm": 0.8535734415054321,
"learning_rate": 4.825443373830471e-05,
"loss": 7.4374,
"step": 2500
},
{
"epoch": 1.0892333472978635,
"grad_norm": 0.994597852230072,
"learning_rate": 4.818461108783689e-05,
"loss": 7.4306,
"step": 2600
},
{
"epoch": 1.1311269375785504,
"grad_norm": 1.2056490182876587,
"learning_rate": 4.8114788437369084e-05,
"loss": 7.4322,
"step": 2700
},
{
"epoch": 1.1730205278592376,
"grad_norm": 1.2451157569885254,
"learning_rate": 4.8044965786901275e-05,
"loss": 7.4205,
"step": 2800
},
{
"epoch": 1.2149141181399246,
"grad_norm": 0.9964780211448669,
"learning_rate": 4.797514313643346e-05,
"loss": 7.4031,
"step": 2900
},
{
"epoch": 1.2568077084206117,
"grad_norm": 0.8989804983139038,
"learning_rate": 4.790532048596565e-05,
"loss": 7.4043,
"step": 3000
},
{
"epoch": 1.2987012987012987,
"grad_norm": 1.1330469846725464,
"learning_rate": 4.783549783549784e-05,
"loss": 7.4031,
"step": 3100
},
{
"epoch": 1.3405948889819856,
"grad_norm": 0.9531299471855164,
"learning_rate": 4.7765675185030025e-05,
"loss": 7.3866,
"step": 3200
},
{
"epoch": 1.3824884792626728,
"grad_norm": 1.0342323780059814,
"learning_rate": 4.7695852534562216e-05,
"loss": 7.3477,
"step": 3300
},
{
"epoch": 1.42438206954336,
"grad_norm": 1.0523111820220947,
"learning_rate": 4.76260298840944e-05,
"loss": 7.3726,
"step": 3400
},
{
"epoch": 1.466275659824047,
"grad_norm": 1.298751711845398,
"learning_rate": 4.755620723362659e-05,
"loss": 7.3484,
"step": 3500
},
{
"epoch": 1.508169250104734,
"grad_norm": 1.0065233707427979,
"learning_rate": 4.7486384583158775e-05,
"loss": 7.3567,
"step": 3600
},
{
"epoch": 1.550062840385421,
"grad_norm": 1.2989579439163208,
"learning_rate": 4.7416561932690966e-05,
"loss": 7.3275,
"step": 3700
},
{
"epoch": 1.591956430666108,
"grad_norm": 1.0343406200408936,
"learning_rate": 4.734673928222316e-05,
"loss": 7.3353,
"step": 3800
},
{
"epoch": 1.6338500209467952,
"grad_norm": 0.9944115281105042,
"learning_rate": 4.727691663175534e-05,
"loss": 7.3304,
"step": 3900
},
{
"epoch": 1.6757436112274822,
"grad_norm": 1.102974534034729,
"learning_rate": 4.720709398128753e-05,
"loss": 7.3128,
"step": 4000
},
{
"epoch": 1.7176372015081691,
"grad_norm": 1.112282156944275,
"learning_rate": 4.713727133081972e-05,
"loss": 7.333,
"step": 4100
},
{
"epoch": 1.7595307917888563,
"grad_norm": 1.2143328189849854,
"learning_rate": 4.7067448680351914e-05,
"loss": 7.3342,
"step": 4200
},
{
"epoch": 1.8014243820695435,
"grad_norm": 1.1656922101974487,
"learning_rate": 4.69976260298841e-05,
"loss": 7.2995,
"step": 4300
},
{
"epoch": 1.8433179723502304,
"grad_norm": 1.2085694074630737,
"learning_rate": 4.692780337941628e-05,
"loss": 7.2939,
"step": 4400
},
{
"epoch": 1.8852115626309174,
"grad_norm": 1.1366217136383057,
"learning_rate": 4.685798072894847e-05,
"loss": 7.3079,
"step": 4500
},
{
"epoch": 1.9271051529116046,
"grad_norm": 1.5368098020553589,
"learning_rate": 4.678815807848066e-05,
"loss": 7.2682,
"step": 4600
},
{
"epoch": 1.9689987431922917,
"grad_norm": 1.1548763513565063,
"learning_rate": 4.671833542801285e-05,
"loss": 7.2708,
"step": 4700
},
{
"epoch": 2.0108923334729787,
"grad_norm": 1.1510928869247437,
"learning_rate": 4.664851277754504e-05,
"loss": 7.2794,
"step": 4800
},
{
"epoch": 2.0527859237536656,
"grad_norm": 1.103461503982544,
"learning_rate": 4.6578690127077224e-05,
"loss": 7.2612,
"step": 4900
},
{
"epoch": 2.0946795140343526,
"grad_norm": 1.2767215967178345,
"learning_rate": 4.6508867476609414e-05,
"loss": 7.2436,
"step": 5000
},
{
"epoch": 2.13657310431504,
"grad_norm": 1.3062710762023926,
"learning_rate": 4.6439044826141605e-05,
"loss": 7.2162,
"step": 5100
},
{
"epoch": 2.178466694595727,
"grad_norm": 1.2461299896240234,
"learning_rate": 4.6369222175673796e-05,
"loss": 7.2387,
"step": 5200
},
{
"epoch": 2.220360284876414,
"grad_norm": 1.5427358150482178,
"learning_rate": 4.629939952520598e-05,
"loss": 7.2004,
"step": 5300
},
{
"epoch": 2.262253875157101,
"grad_norm": 1.390331506729126,
"learning_rate": 4.6229576874738165e-05,
"loss": 7.2403,
"step": 5400
},
{
"epoch": 2.3041474654377883,
"grad_norm": 1.4087032079696655,
"learning_rate": 4.6159754224270356e-05,
"loss": 7.2157,
"step": 5500
},
{
"epoch": 2.346041055718475,
"grad_norm": 1.2417359352111816,
"learning_rate": 4.608993157380254e-05,
"loss": 7.2277,
"step": 5600
},
{
"epoch": 2.387934645999162,
"grad_norm": 1.4267281293869019,
"learning_rate": 4.602010892333473e-05,
"loss": 7.1999,
"step": 5700
},
{
"epoch": 2.429828236279849,
"grad_norm": 1.3897684812545776,
"learning_rate": 4.595028627286692e-05,
"loss": 7.2155,
"step": 5800
},
{
"epoch": 2.471721826560536,
"grad_norm": 1.326821208000183,
"learning_rate": 4.5880463622399106e-05,
"loss": 7.1705,
"step": 5900
},
{
"epoch": 2.5136154168412235,
"grad_norm": 1.2585749626159668,
"learning_rate": 4.58106409719313e-05,
"loss": 7.1787,
"step": 6000
},
{
"epoch": 2.5555090071219104,
"grad_norm": 1.4856244325637817,
"learning_rate": 4.574081832146349e-05,
"loss": 7.1923,
"step": 6100
},
{
"epoch": 2.5974025974025974,
"grad_norm": 1.2883421182632446,
"learning_rate": 4.567099567099568e-05,
"loss": 7.1776,
"step": 6200
},
{
"epoch": 2.6392961876832843,
"grad_norm": 1.4935518503189087,
"learning_rate": 4.560117302052786e-05,
"loss": 7.1711,
"step": 6300
},
{
"epoch": 2.6811897779639713,
"grad_norm": 1.3920152187347412,
"learning_rate": 4.553135037006005e-05,
"loss": 7.1292,
"step": 6400
},
{
"epoch": 2.7230833682446587,
"grad_norm": 1.2802495956420898,
"learning_rate": 4.546152771959224e-05,
"loss": 7.1558,
"step": 6500
},
{
"epoch": 2.7649769585253456,
"grad_norm": 1.4111789464950562,
"learning_rate": 4.5392403295629106e-05,
"loss": 7.172,
"step": 6600
},
{
"epoch": 2.8068705488060326,
"grad_norm": 1.6390964984893799,
"learning_rate": 4.53225806451613e-05,
"loss": 7.1263,
"step": 6700
},
{
"epoch": 2.84876413908672,
"grad_norm": 1.4132812023162842,
"learning_rate": 4.525275799469348e-05,
"loss": 7.1259,
"step": 6800
},
{
"epoch": 2.890657729367407,
"grad_norm": 1.4943978786468506,
"learning_rate": 4.518293534422567e-05,
"loss": 7.1252,
"step": 6900
},
{
"epoch": 2.932551319648094,
"grad_norm": 1.3022414445877075,
"learning_rate": 4.5113112693757856e-05,
"loss": 7.107,
"step": 7000
},
{
"epoch": 2.974444909928781,
"grad_norm": 1.4270446300506592,
"learning_rate": 4.504329004329004e-05,
"loss": 7.1278,
"step": 7100
},
{
"epoch": 3.016338500209468,
"grad_norm": 1.3672137260437012,
"learning_rate": 4.497346739282223e-05,
"loss": 7.1282,
"step": 7200
},
{
"epoch": 3.058232090490155,
"grad_norm": 1.955368995666504,
"learning_rate": 4.490364474235442e-05,
"loss": 7.1123,
"step": 7300
},
{
"epoch": 3.100125680770842,
"grad_norm": 1.3990498781204224,
"learning_rate": 4.483382209188661e-05,
"loss": 7.1117,
"step": 7400
},
{
"epoch": 3.142019271051529,
"grad_norm": 1.6294671297073364,
"learning_rate": 4.47639994414188e-05,
"loss": 7.0721,
"step": 7500
},
{
"epoch": 3.183912861332216,
"grad_norm": 1.3939063549041748,
"learning_rate": 4.469417679095099e-05,
"loss": 7.0599,
"step": 7600
},
{
"epoch": 3.225806451612903,
"grad_norm": 1.918155312538147,
"learning_rate": 4.462435414048318e-05,
"loss": 7.0759,
"step": 7700
},
{
"epoch": 3.2677000418935904,
"grad_norm": 1.3072093725204468,
"learning_rate": 4.455453149001536e-05,
"loss": 7.0784,
"step": 7800
},
{
"epoch": 3.3095936321742774,
"grad_norm": 1.3794573545455933,
"learning_rate": 4.4484708839547554e-05,
"loss": 7.0726,
"step": 7900
},
{
"epoch": 3.3514872224549643,
"grad_norm": 1.4223533868789673,
"learning_rate": 4.441488618907974e-05,
"loss": 7.0513,
"step": 8000
},
{
"epoch": 3.3933808127356513,
"grad_norm": 1.695520043373108,
"learning_rate": 4.434506353861192e-05,
"loss": 7.0586,
"step": 8100
},
{
"epoch": 3.4352744030163387,
"grad_norm": 2.290275812149048,
"learning_rate": 4.4275240888144113e-05,
"loss": 7.0429,
"step": 8200
},
{
"epoch": 3.4771679932970256,
"grad_norm": 1.5322943925857544,
"learning_rate": 4.4205418237676304e-05,
"loss": 7.0196,
"step": 8300
},
{
"epoch": 3.5190615835777126,
"grad_norm": 1.4767639636993408,
"learning_rate": 4.4135595587208495e-05,
"loss": 7.0643,
"step": 8400
},
{
"epoch": 3.5609551738583995,
"grad_norm": 1.4343881607055664,
"learning_rate": 4.406577293674068e-05,
"loss": 7.0153,
"step": 8500
},
{
"epoch": 3.602848764139087,
"grad_norm": 1.7641897201538086,
"learning_rate": 4.399664851277755e-05,
"loss": 7.013,
"step": 8600
},
{
"epoch": 3.644742354419774,
"grad_norm": 1.9688279628753662,
"learning_rate": 4.392682586230973e-05,
"loss": 6.9985,
"step": 8700
},
{
"epoch": 3.686635944700461,
"grad_norm": 1.7434871196746826,
"learning_rate": 4.385700321184192e-05,
"loss": 7.0142,
"step": 8800
},
{
"epoch": 3.728529534981148,
"grad_norm": 1.550470232963562,
"learning_rate": 4.378718056137411e-05,
"loss": 6.9975,
"step": 8900
},
{
"epoch": 3.7704231252618348,
"grad_norm": 1.759869933128357,
"learning_rate": 4.37173579109063e-05,
"loss": 6.9821,
"step": 9000
},
{
"epoch": 3.812316715542522,
"grad_norm": 2.052905797958374,
"learning_rate": 4.364753526043849e-05,
"loss": 6.9917,
"step": 9100
},
{
"epoch": 3.854210305823209,
"grad_norm": 1.8859872817993164,
"learning_rate": 4.357771260997068e-05,
"loss": 6.9934,
"step": 9200
},
{
"epoch": 3.896103896103896,
"grad_norm": 1.8349354267120361,
"learning_rate": 4.3507889959502863e-05,
"loss": 6.9861,
"step": 9300
},
{
"epoch": 3.937997486384583,
"grad_norm": 2.519893169403076,
"learning_rate": 4.3438067309035054e-05,
"loss": 6.9611,
"step": 9400
},
{
"epoch": 3.97989107666527,
"grad_norm": 1.506072759628296,
"learning_rate": 4.336824465856724e-05,
"loss": 6.9688,
"step": 9500
},
{
"epoch": 4.021784666945957,
"grad_norm": 1.5342004299163818,
"learning_rate": 4.329842200809943e-05,
"loss": 6.9582,
"step": 9600
},
{
"epoch": 4.063678257226645,
"grad_norm": 1.6476861238479614,
"learning_rate": 4.3228599357631614e-05,
"loss": 6.9712,
"step": 9700
},
{
"epoch": 4.105571847507331,
"grad_norm": 2.112595319747925,
"learning_rate": 4.3158776707163805e-05,
"loss": 6.9568,
"step": 9800
},
{
"epoch": 4.147465437788019,
"grad_norm": 2.390101194381714,
"learning_rate": 4.3088954056695996e-05,
"loss": 6.9501,
"step": 9900
},
{
"epoch": 4.189359028068705,
"grad_norm": 1.900177240371704,
"learning_rate": 4.301913140622818e-05,
"loss": 6.935,
"step": 10000
},
{
"epoch": 4.231252618349393,
"grad_norm": 1.7032443284988403,
"learning_rate": 4.294930875576037e-05,
"loss": 6.9343,
"step": 10100
},
{
"epoch": 4.27314620863008,
"grad_norm": 1.8393847942352295,
"learning_rate": 4.287948610529256e-05,
"loss": 6.8927,
"step": 10200
},
{
"epoch": 4.3150397989107665,
"grad_norm": 2.046727180480957,
"learning_rate": 4.280966345482475e-05,
"loss": 6.9156,
"step": 10300
},
{
"epoch": 4.356933389191454,
"grad_norm": 1.832216501235962,
"learning_rate": 4.273984080435694e-05,
"loss": 6.89,
"step": 10400
},
{
"epoch": 4.39882697947214,
"grad_norm": 1.8682448863983154,
"learning_rate": 4.267001815388912e-05,
"loss": 6.8995,
"step": 10500
},
{
"epoch": 4.440720569752828,
"grad_norm": 2.0732340812683105,
"learning_rate": 4.260089372992599e-05,
"loss": 6.9094,
"step": 10600
},
{
"epoch": 4.482614160033515,
"grad_norm": 1.6016206741333008,
"learning_rate": 4.253107107945818e-05,
"loss": 6.913,
"step": 10700
},
{
"epoch": 4.524507750314202,
"grad_norm": 2.063062906265259,
"learning_rate": 4.246124842899037e-05,
"loss": 6.8943,
"step": 10800
},
{
"epoch": 4.566401340594889,
"grad_norm": 1.9563026428222656,
"learning_rate": 4.2391425778522555e-05,
"loss": 6.8986,
"step": 10900
},
{
"epoch": 4.6082949308755765,
"grad_norm": 1.8872498273849487,
"learning_rate": 4.2321603128054746e-05,
"loss": 6.8883,
"step": 11000
},
{
"epoch": 4.650188521156263,
"grad_norm": 2.1376144886016846,
"learning_rate": 4.225178047758693e-05,
"loss": 6.8716,
"step": 11100
},
{
"epoch": 4.69208211143695,
"grad_norm": 1.938679575920105,
"learning_rate": 4.218195782711912e-05,
"loss": 6.8836,
"step": 11200
},
{
"epoch": 4.733975701717637,
"grad_norm": 1.9372957944869995,
"learning_rate": 4.2112135176651305e-05,
"loss": 6.8925,
"step": 11300
},
{
"epoch": 4.775869291998324,
"grad_norm": 2.716827630996704,
"learning_rate": 4.2042312526183496e-05,
"loss": 6.8284,
"step": 11400
},
{
"epoch": 4.817762882279011,
"grad_norm": 1.942700743675232,
"learning_rate": 4.197248987571568e-05,
"loss": 6.8753,
"step": 11500
},
{
"epoch": 4.859656472559698,
"grad_norm": 2.026385545730591,
"learning_rate": 4.190266722524787e-05,
"loss": 6.8707,
"step": 11600
},
{
"epoch": 4.901550062840386,
"grad_norm": 1.7594517469406128,
"learning_rate": 4.183284457478006e-05,
"loss": 6.8427,
"step": 11700
},
{
"epoch": 4.943443653121072,
"grad_norm": 1.8161801099777222,
"learning_rate": 4.176302192431225e-05,
"loss": 6.8519,
"step": 11800
},
{
"epoch": 4.9853372434017595,
"grad_norm": 2.6034481525421143,
"learning_rate": 4.169319927384444e-05,
"loss": 6.8448,
"step": 11900
},
{
"epoch": 5.027230833682447,
"grad_norm": 1.93776535987854,
"learning_rate": 4.162337662337663e-05,
"loss": 6.8071,
"step": 12000
},
{
"epoch": 5.0691244239631335,
"grad_norm": 2.0754964351654053,
"learning_rate": 4.155355397290881e-05,
"loss": 6.8386,
"step": 12100
},
{
"epoch": 5.111018014243821,
"grad_norm": 2.0640342235565186,
"learning_rate": 4.1483731322440996e-05,
"loss": 6.8402,
"step": 12200
},
{
"epoch": 5.152911604524507,
"grad_norm": 1.8218064308166504,
"learning_rate": 4.141390867197319e-05,
"loss": 6.8153,
"step": 12300
},
{
"epoch": 5.194805194805195,
"grad_norm": 2.0181634426116943,
"learning_rate": 4.134408602150538e-05,
"loss": 6.8104,
"step": 12400
},
{
"epoch": 5.236698785085882,
"grad_norm": 2.5224316120147705,
"learning_rate": 4.127426337103757e-05,
"loss": 6.8355,
"step": 12500
},
{
"epoch": 5.278592375366569,
"grad_norm": 3.1008002758026123,
"learning_rate": 4.120513894707444e-05,
"loss": 6.8384,
"step": 12600
},
{
"epoch": 5.320485965647256,
"grad_norm": 1.8872394561767578,
"learning_rate": 4.113531629660662e-05,
"loss": 6.8087,
"step": 12700
},
{
"epoch": 5.362379555927943,
"grad_norm": 2.109281063079834,
"learning_rate": 4.1065493646138805e-05,
"loss": 6.8161,
"step": 12800
},
{
"epoch": 5.40427314620863,
"grad_norm": 1.7881128787994385,
"learning_rate": 4.0995670995670996e-05,
"loss": 6.8215,
"step": 12900
},
{
"epoch": 5.446166736489317,
"grad_norm": 2.5179624557495117,
"learning_rate": 4.092584834520319e-05,
"loss": 6.7883,
"step": 13000
},
{
"epoch": 5.488060326770004,
"grad_norm": 2.4349751472473145,
"learning_rate": 4.085602569473537e-05,
"loss": 6.792,
"step": 13100
},
{
"epoch": 5.529953917050691,
"grad_norm": 2.011018991470337,
"learning_rate": 4.078620304426756e-05,
"loss": 6.7846,
"step": 13200
},
{
"epoch": 5.571847507331379,
"grad_norm": 2.519958019256592,
"learning_rate": 4.071638039379975e-05,
"loss": 6.7887,
"step": 13300
},
{
"epoch": 5.613741097612065,
"grad_norm": 1.9241886138916016,
"learning_rate": 4.064655774333194e-05,
"loss": 6.7662,
"step": 13400
},
{
"epoch": 5.655634687892753,
"grad_norm": 1.8995391130447388,
"learning_rate": 4.057673509286413e-05,
"loss": 6.7672,
"step": 13500
},
{
"epoch": 5.697528278173439,
"grad_norm": 2.1511363983154297,
"learning_rate": 4.050691244239632e-05,
"loss": 6.7867,
"step": 13600
},
{
"epoch": 5.7394218684541265,
"grad_norm": 1.8995012044906616,
"learning_rate": 4.04370897919285e-05,
"loss": 6.7563,
"step": 13700
},
{
"epoch": 5.781315458734814,
"grad_norm": 1.83163321018219,
"learning_rate": 4.036726714146069e-05,
"loss": 6.7848,
"step": 13800
},
{
"epoch": 5.8232090490155,
"grad_norm": 2.2616159915924072,
"learning_rate": 4.029744449099288e-05,
"loss": 6.7896,
"step": 13900
},
{
"epoch": 5.865102639296188,
"grad_norm": 2.0548572540283203,
"learning_rate": 4.0228320067029746e-05,
"loss": 6.7633,
"step": 14000
},
{
"epoch": 5.906996229576874,
"grad_norm": 2.4749302864074707,
"learning_rate": 4.015849741656194e-05,
"loss": 6.7267,
"step": 14100
},
{
"epoch": 5.948889819857562,
"grad_norm": 1.906648874282837,
"learning_rate": 4.008867476609413e-05,
"loss": 6.7645,
"step": 14200
},
{
"epoch": 5.990783410138249,
"grad_norm": 2.0839619636535645,
"learning_rate": 4.001885211562631e-05,
"loss": 6.8082,
"step": 14300
},
{
"epoch": 6.032677000418936,
"grad_norm": 2.1202664375305176,
"learning_rate": 3.9949029465158496e-05,
"loss": 6.7625,
"step": 14400
},
{
"epoch": 6.074570590699623,
"grad_norm": 1.988951563835144,
"learning_rate": 3.987920681469069e-05,
"loss": 6.7413,
"step": 14500
},
{
"epoch": 6.11646418098031,
"grad_norm": 2.4327659606933594,
"learning_rate": 3.980938416422287e-05,
"loss": 6.7123,
"step": 14600
},
{
"epoch": 6.158357771260997,
"grad_norm": 2.07710599899292,
"learning_rate": 3.973956151375506e-05,
"loss": 6.7316,
"step": 14700
},
{
"epoch": 6.200251361541684,
"grad_norm": 1.9640876054763794,
"learning_rate": 3.966973886328725e-05,
"loss": 6.752,
"step": 14800
},
{
"epoch": 6.242144951822371,
"grad_norm": 2.3012888431549072,
"learning_rate": 3.959991621281944e-05,
"loss": 6.7188,
"step": 14900
},
{
"epoch": 6.284038542103058,
"grad_norm": 2.0262773036956787,
"learning_rate": 3.953009356235163e-05,
"loss": 6.7255,
"step": 15000
},
{
"epoch": 6.325932132383746,
"grad_norm": 1.8689815998077393,
"learning_rate": 3.946027091188382e-05,
"loss": 6.7132,
"step": 15100
},
{
"epoch": 6.367825722664432,
"grad_norm": 2.188612937927246,
"learning_rate": 3.939044826141601e-05,
"loss": 6.7407,
"step": 15200
},
{
"epoch": 6.4097193129451195,
"grad_norm": 2.0168368816375732,
"learning_rate": 3.9320625610948195e-05,
"loss": 6.7132,
"step": 15300
},
{
"epoch": 6.451612903225806,
"grad_norm": 2.496889352798462,
"learning_rate": 3.925080296048038e-05,
"loss": 6.7003,
"step": 15400
},
{
"epoch": 6.4935064935064934,
"grad_norm": 2.1601486206054688,
"learning_rate": 3.918098031001257e-05,
"loss": 6.7056,
"step": 15500
},
{
"epoch": 6.535400083787181,
"grad_norm": 2.300112009048462,
"learning_rate": 3.9111157659544754e-05,
"loss": 6.7314,
"step": 15600
},
{
"epoch": 6.577293674067867,
"grad_norm": 2.321880578994751,
"learning_rate": 3.9041335009076945e-05,
"loss": 6.7166,
"step": 15700
},
{
"epoch": 6.619187264348555,
"grad_norm": 2.029465913772583,
"learning_rate": 3.8971512358609136e-05,
"loss": 6.6908,
"step": 15800
},
{
"epoch": 6.661080854629242,
"grad_norm": 2.258577585220337,
"learning_rate": 3.890168970814133e-05,
"loss": 6.7359,
"step": 15900
},
{
"epoch": 6.702974444909929,
"grad_norm": 2.3579437732696533,
"learning_rate": 3.883186705767351e-05,
"loss": 6.7021,
"step": 16000
},
{
"epoch": 6.744868035190616,
"grad_norm": 2.236828565597534,
"learning_rate": 3.87620444072057e-05,
"loss": 6.6897,
"step": 16100
},
{
"epoch": 6.786761625471303,
"grad_norm": 2.6255593299865723,
"learning_rate": 3.869222175673789e-05,
"loss": 6.6899,
"step": 16200
},
{
"epoch": 6.82865521575199,
"grad_norm": 2.297067880630493,
"learning_rate": 3.862239910627008e-05,
"loss": 6.7058,
"step": 16300
},
{
"epoch": 6.870548806032677,
"grad_norm": 2.440605640411377,
"learning_rate": 3.8553274682306945e-05,
"loss": 6.6559,
"step": 16400
},
{
"epoch": 6.912442396313364,
"grad_norm": 2.0427000522613525,
"learning_rate": 3.848345203183913e-05,
"loss": 6.6799,
"step": 16500
},
{
"epoch": 6.954335986594051,
"grad_norm": 2.0323081016540527,
"learning_rate": 3.841362938137132e-05,
"loss": 6.6863,
"step": 16600
},
{
"epoch": 6.996229576874738,
"grad_norm": 3.407731533050537,
"learning_rate": 3.834380673090351e-05,
"loss": 6.6767,
"step": 16700
},
{
"epoch": 7.038123167155425,
"grad_norm": 2.112870931625366,
"learning_rate": 3.8273984080435695e-05,
"loss": 6.682,
"step": 16800
},
{
"epoch": 7.080016757436113,
"grad_norm": 2.710810422897339,
"learning_rate": 3.8204161429967886e-05,
"loss": 6.7046,
"step": 16900
},
{
"epoch": 7.121910347716799,
"grad_norm": 2.0754942893981934,
"learning_rate": 3.813433877950007e-05,
"loss": 6.6511,
"step": 17000
},
{
"epoch": 7.1638039379974865,
"grad_norm": 3.1009552478790283,
"learning_rate": 3.8064516129032254e-05,
"loss": 6.666,
"step": 17100
},
{
"epoch": 7.205697528278174,
"grad_norm": 2.1582441329956055,
"learning_rate": 3.7994693478564445e-05,
"loss": 6.6574,
"step": 17200
},
{
"epoch": 7.24759111855886,
"grad_norm": 2.680147647857666,
"learning_rate": 3.7924870828096636e-05,
"loss": 6.6814,
"step": 17300
},
{
"epoch": 7.289484708839548,
"grad_norm": 2.0264320373535156,
"learning_rate": 3.785504817762883e-05,
"loss": 6.668,
"step": 17400
},
{
"epoch": 7.331378299120234,
"grad_norm": 2.032093048095703,
"learning_rate": 3.778522552716101e-05,
"loss": 6.6603,
"step": 17500
},
{
"epoch": 7.373271889400922,
"grad_norm": 2.4837894439697266,
"learning_rate": 3.77154028766932e-05,
"loss": 6.6817,
"step": 17600
},
{
"epoch": 7.415165479681609,
"grad_norm": 2.70166015625,
"learning_rate": 3.764558022622539e-05,
"loss": 6.6657,
"step": 17700
},
{
"epoch": 7.457059069962296,
"grad_norm": 2.3508477210998535,
"learning_rate": 3.757575757575758e-05,
"loss": 6.6314,
"step": 17800
},
{
"epoch": 7.498952660242983,
"grad_norm": 2.450437307357788,
"learning_rate": 3.750593492528977e-05,
"loss": 6.6551,
"step": 17900
},
{
"epoch": 7.5408462505236695,
"grad_norm": 1.9939864873886108,
"learning_rate": 3.743611227482195e-05,
"loss": 6.6128,
"step": 18000
},
{
"epoch": 7.582739840804357,
"grad_norm": 2.470285177230835,
"learning_rate": 3.736628962435414e-05,
"loss": 6.6126,
"step": 18100
},
{
"epoch": 7.624633431085044,
"grad_norm": 2.5651469230651855,
"learning_rate": 3.729646697388633e-05,
"loss": 6.6694,
"step": 18200
},
{
"epoch": 7.666527021365731,
"grad_norm": 2.361785650253296,
"learning_rate": 3.722664432341852e-05,
"loss": 6.6349,
"step": 18300
},
{
"epoch": 7.708420611646418,
"grad_norm": 2.371994972229004,
"learning_rate": 3.715682167295071e-05,
"loss": 6.6483,
"step": 18400
},
{
"epoch": 7.750314201927106,
"grad_norm": 2.862107038497925,
"learning_rate": 3.708769724898758e-05,
"loss": 6.634,
"step": 18500
},
{
"epoch": 7.792207792207792,
"grad_norm": 2.815486192703247,
"learning_rate": 3.701787459851976e-05,
"loss": 6.6324,
"step": 18600
},
{
"epoch": 7.8341013824884795,
"grad_norm": 1.930017352104187,
"learning_rate": 3.6948051948051945e-05,
"loss": 6.6275,
"step": 18700
},
{
"epoch": 7.875994972769166,
"grad_norm": 3.1758625507354736,
"learning_rate": 3.6878229297584136e-05,
"loss": 6.6529,
"step": 18800
},
{
"epoch": 7.9178885630498534,
"grad_norm": 2.1219429969787598,
"learning_rate": 3.680840664711633e-05,
"loss": 6.6085,
"step": 18900
},
{
"epoch": 7.95978215333054,
"grad_norm": 2.1965785026550293,
"learning_rate": 3.673858399664851e-05,
"loss": 6.6206,
"step": 19000
},
{
"epoch": 8.001675743611228,
"grad_norm": 2.489473581314087,
"learning_rate": 3.66687613461807e-05,
"loss": 6.6089,
"step": 19100
},
{
"epoch": 8.043569333891915,
"grad_norm": 2.3411850929260254,
"learning_rate": 3.659893869571289e-05,
"loss": 6.6286,
"step": 19200
},
{
"epoch": 8.085462924172601,
"grad_norm": 2.32071590423584,
"learning_rate": 3.6529116045245084e-05,
"loss": 6.5984,
"step": 19300
},
{
"epoch": 8.12735651445329,
"grad_norm": 2.402956247329712,
"learning_rate": 3.645929339477727e-05,
"loss": 6.5952,
"step": 19400
},
{
"epoch": 8.169250104733976,
"grad_norm": 2.6951029300689697,
"learning_rate": 3.638947074430946e-05,
"loss": 6.6106,
"step": 19500
},
{
"epoch": 8.211143695014663,
"grad_norm": 2.807187080383301,
"learning_rate": 3.6319648093841643e-05,
"loss": 6.6109,
"step": 19600
},
{
"epoch": 8.253037285295349,
"grad_norm": 2.798614025115967,
"learning_rate": 3.624982544337383e-05,
"loss": 6.6052,
"step": 19700
},
{
"epoch": 8.294930875576037,
"grad_norm": 4.015589237213135,
"learning_rate": 3.618000279290602e-05,
"loss": 6.5995,
"step": 19800
},
{
"epoch": 8.336824465856724,
"grad_norm": 2.6923959255218506,
"learning_rate": 3.611018014243821e-05,
"loss": 6.5855,
"step": 19900
},
{
"epoch": 8.37871805613741,
"grad_norm": 2.112994909286499,
"learning_rate": 3.6040357491970394e-05,
"loss": 6.5968,
"step": 20000
},
{
"epoch": 8.420611646418099,
"grad_norm": 2.8196451663970947,
"learning_rate": 3.5970534841502585e-05,
"loss": 6.5977,
"step": 20100
},
{
"epoch": 8.462505236698785,
"grad_norm": 2.2421326637268066,
"learning_rate": 3.5900712191034776e-05,
"loss": 6.5846,
"step": 20200
},
{
"epoch": 8.504398826979472,
"grad_norm": 2.634634256362915,
"learning_rate": 3.583088954056697e-05,
"loss": 6.5955,
"step": 20300
},
{
"epoch": 8.54629241726016,
"grad_norm": 2.101125955581665,
"learning_rate": 3.576106689009915e-05,
"loss": 6.6013,
"step": 20400
},
{
"epoch": 8.588186007540846,
"grad_norm": 2.719330072402954,
"learning_rate": 3.569194246613601e-05,
"loss": 6.5668,
"step": 20500
},
{
"epoch": 8.630079597821533,
"grad_norm": 2.283790349960327,
"learning_rate": 3.56221198156682e-05,
"loss": 6.6107,
"step": 20600
},
{
"epoch": 8.671973188102221,
"grad_norm": 2.1805171966552734,
"learning_rate": 3.5552297165200393e-05,
"loss": 6.5875,
"step": 20700
},
{
"epoch": 8.713866778382908,
"grad_norm": 2.6632487773895264,
"learning_rate": 3.5482474514732584e-05,
"loss": 6.613,
"step": 20800
},
{
"epoch": 8.755760368663594,
"grad_norm": 2.3296337127685547,
"learning_rate": 3.541265186426477e-05,
"loss": 6.5628,
"step": 20900
},
{
"epoch": 8.79765395894428,
"grad_norm": 2.8429343700408936,
"learning_rate": 3.534282921379696e-05,
"loss": 6.5823,
"step": 21000
},
{
"epoch": 8.839547549224969,
"grad_norm": 2.4361233711242676,
"learning_rate": 3.527300656332915e-05,
"loss": 6.5853,
"step": 21100
},
{
"epoch": 8.881441139505656,
"grad_norm": 2.5633111000061035,
"learning_rate": 3.5203183912861335e-05,
"loss": 6.5979,
"step": 21200
},
{
"epoch": 8.923334729786342,
"grad_norm": 2.350463628768921,
"learning_rate": 3.513336126239352e-05,
"loss": 6.5744,
"step": 21300
},
{
"epoch": 8.96522832006703,
"grad_norm": 2.456291675567627,
"learning_rate": 3.506353861192571e-05,
"loss": 6.57,
"step": 21400
},
{
"epoch": 9.007121910347717,
"grad_norm": 2.401036262512207,
"learning_rate": 3.49937159614579e-05,
"loss": 6.5614,
"step": 21500
},
{
"epoch": 9.049015500628403,
"grad_norm": 2.5537233352661133,
"learning_rate": 3.4923893310990085e-05,
"loss": 6.5836,
"step": 21600
},
{
"epoch": 9.090909090909092,
"grad_norm": 2.6386375427246094,
"learning_rate": 3.4854070660522276e-05,
"loss": 6.6178,
"step": 21700
},
{
"epoch": 9.132802681189778,
"grad_norm": 2.508533477783203,
"learning_rate": 3.478424801005447e-05,
"loss": 6.5761,
"step": 21800
},
{
"epoch": 9.174696271470465,
"grad_norm": 3.1510419845581055,
"learning_rate": 3.471442535958665e-05,
"loss": 6.558,
"step": 21900
},
{
"epoch": 9.216589861751151,
"grad_norm": 2.6325526237487793,
"learning_rate": 3.464460270911884e-05,
"loss": 6.5661,
"step": 22000
},
{
"epoch": 9.25848345203184,
"grad_norm": 2.9870827198028564,
"learning_rate": 3.457478005865103e-05,
"loss": 6.5392,
"step": 22100
},
{
"epoch": 9.300377042312526,
"grad_norm": 2.4924209117889404,
"learning_rate": 3.450495740818322e-05,
"loss": 6.547,
"step": 22200
},
{
"epoch": 9.342270632593213,
"grad_norm": 2.3227298259735107,
"learning_rate": 3.44351347577154e-05,
"loss": 6.5306,
"step": 22300
},
{
"epoch": 9.3841642228739,
"grad_norm": 2.867182731628418,
"learning_rate": 3.436531210724759e-05,
"loss": 6.5628,
"step": 22400
},
{
"epoch": 9.426057813154587,
"grad_norm": 2.2619149684906006,
"learning_rate": 3.429548945677978e-05,
"loss": 6.5192,
"step": 22500
},
{
"epoch": 9.467951403435274,
"grad_norm": 2.232321262359619,
"learning_rate": 3.422636503281665e-05,
"loss": 6.56,
"step": 22600
},
{
"epoch": 9.509844993715962,
"grad_norm": 2.4485862255096436,
"learning_rate": 3.4156542382348835e-05,
"loss": 6.557,
"step": 22700
},
{
"epoch": 9.551738583996649,
"grad_norm": 2.4476943016052246,
"learning_rate": 3.4086719731881026e-05,
"loss": 6.5314,
"step": 22800
},
{
"epoch": 9.593632174277335,
"grad_norm": 2.491731643676758,
"learning_rate": 3.401689708141321e-05,
"loss": 6.4952,
"step": 22900
},
{
"epoch": 9.635525764558023,
"grad_norm": 2.6474783420562744,
"learning_rate": 3.39470744309454e-05,
"loss": 6.5499,
"step": 23000
},
{
"epoch": 9.67741935483871,
"grad_norm": 2.5691514015197754,
"learning_rate": 3.3877251780477585e-05,
"loss": 6.5417,
"step": 23100
},
{
"epoch": 9.719312945119396,
"grad_norm": 2.601832151412964,
"learning_rate": 3.3807429130009776e-05,
"loss": 6.5584,
"step": 23200
},
{
"epoch": 9.761206535400083,
"grad_norm": 3.481239080429077,
"learning_rate": 3.373760647954197e-05,
"loss": 6.5403,
"step": 23300
},
{
"epoch": 9.803100125680771,
"grad_norm": 3.0747485160827637,
"learning_rate": 3.366778382907415e-05,
"loss": 6.5751,
"step": 23400
},
{
"epoch": 9.844993715961458,
"grad_norm": 2.2310988903045654,
"learning_rate": 3.359796117860634e-05,
"loss": 6.5046,
"step": 23500
},
{
"epoch": 9.886887306242144,
"grad_norm": 2.4555273056030273,
"learning_rate": 3.352813852813853e-05,
"loss": 6.5544,
"step": 23600
},
{
"epoch": 9.928780896522833,
"grad_norm": 3.1235666275024414,
"learning_rate": 3.345831587767072e-05,
"loss": 6.5396,
"step": 23700
},
{
"epoch": 9.970674486803519,
"grad_norm": 2.2766611576080322,
"learning_rate": 3.338849322720291e-05,
"loss": 6.5542,
"step": 23800
},
{
"epoch": 10.012568077084206,
"grad_norm": 3.0408995151519775,
"learning_rate": 3.331867057673509e-05,
"loss": 6.4978,
"step": 23900
},
{
"epoch": 10.054461667364894,
"grad_norm": 2.8702831268310547,
"learning_rate": 3.3248847926267283e-05,
"loss": 6.5264,
"step": 24000
},
{
"epoch": 10.09635525764558,
"grad_norm": 2.9117937088012695,
"learning_rate": 3.317902527579947e-05,
"loss": 6.5028,
"step": 24100
},
{
"epoch": 10.138248847926267,
"grad_norm": 2.925631046295166,
"learning_rate": 3.310920262533166e-05,
"loss": 6.5143,
"step": 24200
},
{
"epoch": 10.180142438206955,
"grad_norm": 2.6605536937713623,
"learning_rate": 3.303937997486385e-05,
"loss": 6.5394,
"step": 24300
},
{
"epoch": 10.222036028487642,
"grad_norm": 2.31357479095459,
"learning_rate": 3.2969557324396034e-05,
"loss": 6.5224,
"step": 24400
},
{
"epoch": 10.263929618768328,
"grad_norm": 2.6544747352600098,
"learning_rate": 3.2899734673928225e-05,
"loss": 6.5035,
"step": 24500
},
{
"epoch": 10.305823209049015,
"grad_norm": 2.5945372581481934,
"learning_rate": 3.2830610249965085e-05,
"loss": 6.4977,
"step": 24600
},
{
"epoch": 10.347716799329703,
"grad_norm": 3.120873212814331,
"learning_rate": 3.2760787599497276e-05,
"loss": 6.5399,
"step": 24700
},
{
"epoch": 10.38961038961039,
"grad_norm": 2.705008029937744,
"learning_rate": 3.269096494902947e-05,
"loss": 6.4938,
"step": 24800
},
{
"epoch": 10.431503979891076,
"grad_norm": 2.2395503520965576,
"learning_rate": 3.262114229856166e-05,
"loss": 6.4854,
"step": 24900
},
{
"epoch": 10.473397570171764,
"grad_norm": 2.5891764163970947,
"learning_rate": 3.255131964809384e-05,
"loss": 6.5107,
"step": 25000
},
{
"epoch": 10.51529116045245,
"grad_norm": 3.115931749343872,
"learning_rate": 3.248149699762603e-05,
"loss": 6.5389,
"step": 25100
},
{
"epoch": 10.557184750733137,
"grad_norm": 2.264437675476074,
"learning_rate": 3.2411674347158224e-05,
"loss": 6.51,
"step": 25200
},
{
"epoch": 10.599078341013826,
"grad_norm": 3.449631690979004,
"learning_rate": 3.234185169669041e-05,
"loss": 6.5161,
"step": 25300
},
{
"epoch": 10.640971931294512,
"grad_norm": 2.478337526321411,
"learning_rate": 3.227202904622259e-05,
"loss": 6.5019,
"step": 25400
},
{
"epoch": 10.682865521575199,
"grad_norm": 3.2756478786468506,
"learning_rate": 3.2202206395754784e-05,
"loss": 6.4869,
"step": 25500
},
{
"epoch": 10.724759111855885,
"grad_norm": 2.7576985359191895,
"learning_rate": 3.213238374528697e-05,
"loss": 6.5206,
"step": 25600
},
{
"epoch": 10.766652702136573,
"grad_norm": 2.200963020324707,
"learning_rate": 3.206256109481916e-05,
"loss": 6.48,
"step": 25700
},
{
"epoch": 10.80854629241726,
"grad_norm": 2.7358744144439697,
"learning_rate": 3.199273844435135e-05,
"loss": 6.5126,
"step": 25800
},
{
"epoch": 10.850439882697946,
"grad_norm": 2.7179319858551025,
"learning_rate": 3.192291579388354e-05,
"loss": 6.4699,
"step": 25900
},
{
"epoch": 10.892333472978635,
"grad_norm": 2.811340808868408,
"learning_rate": 3.1853093143415725e-05,
"loss": 6.5056,
"step": 26000
},
{
"epoch": 10.934227063259321,
"grad_norm": 3.010690450668335,
"learning_rate": 3.1783270492947916e-05,
"loss": 6.5103,
"step": 26100
},
{
"epoch": 10.976120653540008,
"grad_norm": 3.213487148284912,
"learning_rate": 3.171344784248011e-05,
"loss": 6.4874,
"step": 26200
},
{
"epoch": 11.018014243820696,
"grad_norm": 2.5710039138793945,
"learning_rate": 3.164362519201229e-05,
"loss": 6.4919,
"step": 26300
},
{
"epoch": 11.059907834101383,
"grad_norm": 2.6933746337890625,
"learning_rate": 3.1573802541544475e-05,
"loss": 6.5284,
"step": 26400
},
{
"epoch": 11.101801424382069,
"grad_norm": 3.775012254714966,
"learning_rate": 3.1503979891076666e-05,
"loss": 6.4894,
"step": 26500
},
{
"epoch": 11.143695014662757,
"grad_norm": 3.2401301860809326,
"learning_rate": 3.1434855467113534e-05,
"loss": 6.4721,
"step": 26600
},
{
"epoch": 11.185588604943444,
"grad_norm": 2.642794132232666,
"learning_rate": 3.1365032816645725e-05,
"loss": 6.4797,
"step": 26700
},
{
"epoch": 11.22748219522413,
"grad_norm": 3.191567897796631,
"learning_rate": 3.129521016617791e-05,
"loss": 6.5022,
"step": 26800
},
{
"epoch": 11.269375785504817,
"grad_norm": 2.816554307937622,
"learning_rate": 3.12253875157101e-05,
"loss": 6.4853,
"step": 26900
},
{
"epoch": 11.311269375785505,
"grad_norm": 2.8666136264801025,
"learning_rate": 3.1155564865242284e-05,
"loss": 6.4839,
"step": 27000
},
{
"epoch": 11.353162966066192,
"grad_norm": 2.9831254482269287,
"learning_rate": 3.1085742214774475e-05,
"loss": 6.5082,
"step": 27100
},
{
"epoch": 11.395056556346878,
"grad_norm": 2.7065083980560303,
"learning_rate": 3.101591956430666e-05,
"loss": 6.4412,
"step": 27200
},
{
"epoch": 11.436950146627566,
"grad_norm": 2.5580694675445557,
"learning_rate": 3.094609691383885e-05,
"loss": 6.4849,
"step": 27300
},
{
"epoch": 11.478843736908253,
"grad_norm": 2.571390390396118,
"learning_rate": 3.087627426337104e-05,
"loss": 6.4689,
"step": 27400
},
{
"epoch": 11.52073732718894,
"grad_norm": 2.835906982421875,
"learning_rate": 3.0806451612903225e-05,
"loss": 6.4887,
"step": 27500
},
{
"epoch": 11.562630917469628,
"grad_norm": 3.1355161666870117,
"learning_rate": 3.0736628962435416e-05,
"loss": 6.4568,
"step": 27600
},
{
"epoch": 11.604524507750314,
"grad_norm": 3.0155599117279053,
"learning_rate": 3.066680631196761e-05,
"loss": 6.4607,
"step": 27700
},
{
"epoch": 11.646418098031,
"grad_norm": 2.6346957683563232,
"learning_rate": 3.059698366149979e-05,
"loss": 6.4706,
"step": 27800
},
{
"epoch": 11.688311688311689,
"grad_norm": 2.4353625774383545,
"learning_rate": 3.052716101103198e-05,
"loss": 6.482,
"step": 27900
},
{
"epoch": 11.730205278592376,
"grad_norm": 3.29835844039917,
"learning_rate": 3.045733836056417e-05,
"loss": 6.4625,
"step": 28000
},
{
"epoch": 11.772098868873062,
"grad_norm": 2.233579158782959,
"learning_rate": 3.038751571009636e-05,
"loss": 6.4727,
"step": 28100
},
{
"epoch": 11.813992459153749,
"grad_norm": 2.5708439350128174,
"learning_rate": 3.0317693059628545e-05,
"loss": 6.4751,
"step": 28200
},
{
"epoch": 11.855886049434437,
"grad_norm": 2.29488205909729,
"learning_rate": 3.0247870409160732e-05,
"loss": 6.4599,
"step": 28300
},
{
"epoch": 11.897779639715123,
"grad_norm": 2.858208179473877,
"learning_rate": 3.0178047758692923e-05,
"loss": 6.469,
"step": 28400
},
{
"epoch": 11.93967322999581,
"grad_norm": 2.854923725128174,
"learning_rate": 3.0108225108225107e-05,
"loss": 6.4995,
"step": 28500
},
{
"epoch": 11.981566820276498,
"grad_norm": 2.590484857559204,
"learning_rate": 3.003910068426198e-05,
"loss": 6.4508,
"step": 28600
},
{
"epoch": 12.023460410557185,
"grad_norm": 3.3479676246643066,
"learning_rate": 2.9969278033794163e-05,
"loss": 6.4581,
"step": 28700
},
{
"epoch": 12.065354000837871,
"grad_norm": 2.7855923175811768,
"learning_rate": 2.9899455383326354e-05,
"loss": 6.4744,
"step": 28800
},
{
"epoch": 12.10724759111856,
"grad_norm": 3.2668962478637695,
"learning_rate": 2.982963273285854e-05,
"loss": 6.4731,
"step": 28900
},
{
"epoch": 12.149141181399246,
"grad_norm": 2.850735664367676,
"learning_rate": 2.9759810082390725e-05,
"loss": 6.4788,
"step": 29000
},
{
"epoch": 12.191034771679933,
"grad_norm": 2.9676952362060547,
"learning_rate": 2.9689987431922916e-05,
"loss": 6.4525,
"step": 29100
},
{
"epoch": 12.23292836196062,
"grad_norm": 2.604408025741577,
"learning_rate": 2.9620164781455107e-05,
"loss": 6.4564,
"step": 29200
},
{
"epoch": 12.274821952241307,
"grad_norm": 2.974653482437134,
"learning_rate": 2.9550342130987295e-05,
"loss": 6.463,
"step": 29300
},
{
"epoch": 12.316715542521994,
"grad_norm": 3.372664213180542,
"learning_rate": 2.9480519480519482e-05,
"loss": 6.464,
"step": 29400
},
{
"epoch": 12.35860913280268,
"grad_norm": 2.6891355514526367,
"learning_rate": 2.941069683005167e-05,
"loss": 6.4674,
"step": 29500
},
{
"epoch": 12.400502723083369,
"grad_norm": 2.964113473892212,
"learning_rate": 2.934087417958386e-05,
"loss": 6.4539,
"step": 29600
},
{
"epoch": 12.442396313364055,
"grad_norm": 2.7328097820281982,
"learning_rate": 2.9271051529116045e-05,
"loss": 6.4224,
"step": 29700
},
{
"epoch": 12.484289903644742,
"grad_norm": 2.6205203533172607,
"learning_rate": 2.9201228878648236e-05,
"loss": 6.4266,
"step": 29800
},
{
"epoch": 12.52618349392543,
"grad_norm": 3.681053400039673,
"learning_rate": 2.9131406228180424e-05,
"loss": 6.4549,
"step": 29900
},
{
"epoch": 12.568077084206116,
"grad_norm": 2.9732627868652344,
"learning_rate": 2.9061583577712608e-05,
"loss": 6.4466,
"step": 30000
},
{
"epoch": 12.609970674486803,
"grad_norm": 3.47816801071167,
"learning_rate": 2.89917609272448e-05,
"loss": 6.4408,
"step": 30100
},
{
"epoch": 12.651864264767491,
"grad_norm": 2.70326566696167,
"learning_rate": 2.892193827677699e-05,
"loss": 6.4444,
"step": 30200
},
{
"epoch": 12.693757855048178,
"grad_norm": 2.9219532012939453,
"learning_rate": 2.8852115626309177e-05,
"loss": 6.4183,
"step": 30300
},
{
"epoch": 12.735651445328864,
"grad_norm": 2.8546571731567383,
"learning_rate": 2.878229297584136e-05,
"loss": 6.4399,
"step": 30400
},
{
"epoch": 12.777545035609553,
"grad_norm": 2.95047926902771,
"learning_rate": 2.8712470325373552e-05,
"loss": 6.4396,
"step": 30500
},
{
"epoch": 12.819438625890239,
"grad_norm": 3.397934675216675,
"learning_rate": 2.8643345901410416e-05,
"loss": 6.438,
"step": 30600
},
{
"epoch": 12.861332216170926,
"grad_norm": 2.625852346420288,
"learning_rate": 2.8573523250942607e-05,
"loss": 6.4363,
"step": 30700
},
{
"epoch": 12.903225806451612,
"grad_norm": 2.5299527645111084,
"learning_rate": 2.85037006004748e-05,
"loss": 6.3952,
"step": 30800
},
{
"epoch": 12.9451193967323,
"grad_norm": 2.6445415019989014,
"learning_rate": 2.8433877950006983e-05,
"loss": 6.4559,
"step": 30900
},
{
"epoch": 12.987012987012987,
"grad_norm": 2.9675769805908203,
"learning_rate": 2.836405529953917e-05,
"loss": 6.4447,
"step": 31000
},
{
"epoch": 13.028906577293673,
"grad_norm": 2.607391119003296,
"learning_rate": 2.829423264907136e-05,
"loss": 6.446,
"step": 31100
},
{
"epoch": 13.070800167574362,
"grad_norm": 3.196765661239624,
"learning_rate": 2.8224409998603545e-05,
"loss": 6.4336,
"step": 31200
},
{
"epoch": 13.112693757855048,
"grad_norm": 5.778535842895508,
"learning_rate": 2.8154587348135736e-05,
"loss": 6.4339,
"step": 31300
},
{
"epoch": 13.154587348135735,
"grad_norm": 3.0479419231414795,
"learning_rate": 2.8084764697667927e-05,
"loss": 6.4147,
"step": 31400
},
{
"epoch": 13.196480938416423,
"grad_norm": 2.6787302494049072,
"learning_rate": 2.8014942047200115e-05,
"loss": 6.4312,
"step": 31500
},
{
"epoch": 13.23837452869711,
"grad_norm": 2.7929670810699463,
"learning_rate": 2.79451193967323e-05,
"loss": 6.4224,
"step": 31600
},
{
"epoch": 13.280268118977796,
"grad_norm": 2.722101926803589,
"learning_rate": 2.787529674626449e-05,
"loss": 6.4247,
"step": 31700
},
{
"epoch": 13.322161709258484,
"grad_norm": 3.295348644256592,
"learning_rate": 2.780547409579668e-05,
"loss": 6.4435,
"step": 31800
},
{
"epoch": 13.36405529953917,
"grad_norm": 2.5780696868896484,
"learning_rate": 2.7735651445328865e-05,
"loss": 6.406,
"step": 31900
},
{
"epoch": 13.405948889819857,
"grad_norm": 2.955299139022827,
"learning_rate": 2.7665828794861053e-05,
"loss": 6.4633,
"step": 32000
},
{
"epoch": 13.447842480100544,
"grad_norm": 3.8027708530426025,
"learning_rate": 2.7596006144393244e-05,
"loss": 6.4445,
"step": 32100
},
{
"epoch": 13.489736070381232,
"grad_norm": 2.6895995140075684,
"learning_rate": 2.7526183493925428e-05,
"loss": 6.4015,
"step": 32200
},
{
"epoch": 13.531629660661919,
"grad_norm": 2.6936516761779785,
"learning_rate": 2.745636084345762e-05,
"loss": 6.4211,
"step": 32300
},
{
"epoch": 13.573523250942605,
"grad_norm": 2.948420763015747,
"learning_rate": 2.738653819298981e-05,
"loss": 6.4042,
"step": 32400
},
{
"epoch": 13.615416841223293,
"grad_norm": 2.763885974884033,
"learning_rate": 2.7316715542521997e-05,
"loss": 6.393,
"step": 32500
},
{
"epoch": 13.65731043150398,
"grad_norm": 3.1601672172546387,
"learning_rate": 2.724759111855886e-05,
"loss": 6.4398,
"step": 32600
},
{
"epoch": 13.699204021784666,
"grad_norm": 2.4161715507507324,
"learning_rate": 2.7177768468091052e-05,
"loss": 6.401,
"step": 32700
},
{
"epoch": 13.741097612065355,
"grad_norm": 3.0796055793762207,
"learning_rate": 2.7107945817623236e-05,
"loss": 6.4265,
"step": 32800
},
{
"epoch": 13.782991202346041,
"grad_norm": 3.6223697662353516,
"learning_rate": 2.7038123167155427e-05,
"loss": 6.4075,
"step": 32900
},
{
"epoch": 13.824884792626728,
"grad_norm": 2.6991615295410156,
"learning_rate": 2.696830051668762e-05,
"loss": 6.3912,
"step": 33000
},
{
"epoch": 13.866778382907416,
"grad_norm": 3.1701860427856445,
"learning_rate": 2.6898477866219803e-05,
"loss": 6.4173,
"step": 33100
},
{
"epoch": 13.908671973188103,
"grad_norm": 2.915432929992676,
"learning_rate": 2.682865521575199e-05,
"loss": 6.4179,
"step": 33200
},
{
"epoch": 13.950565563468789,
"grad_norm": 3.155080795288086,
"learning_rate": 2.675883256528418e-05,
"loss": 6.3895,
"step": 33300
},
{
"epoch": 13.992459153749476,
"grad_norm": 3.3861114978790283,
"learning_rate": 2.6689009914816365e-05,
"loss": 6.4279,
"step": 33400
},
{
"epoch": 14.034352744030164,
"grad_norm": 3.301805019378662,
"learning_rate": 2.6619187264348556e-05,
"loss": 6.4072,
"step": 33500
},
{
"epoch": 14.07624633431085,
"grad_norm": 3.305147171020508,
"learning_rate": 2.6549364613880744e-05,
"loss": 6.3949,
"step": 33600
},
{
"epoch": 14.118139924591537,
"grad_norm": 2.7602477073669434,
"learning_rate": 2.6479541963412935e-05,
"loss": 6.4048,
"step": 33700
},
{
"epoch": 14.160033514872225,
"grad_norm": 2.5257952213287354,
"learning_rate": 2.640971931294512e-05,
"loss": 6.4033,
"step": 33800
},
{
"epoch": 14.201927105152912,
"grad_norm": 2.4649853706359863,
"learning_rate": 2.633989666247731e-05,
"loss": 6.374,
"step": 33900
},
{
"epoch": 14.243820695433598,
"grad_norm": 2.7136335372924805,
"learning_rate": 2.6270074012009497e-05,
"loss": 6.3993,
"step": 34000
},
{
"epoch": 14.285714285714286,
"grad_norm": 2.801712989807129,
"learning_rate": 2.6200251361541685e-05,
"loss": 6.4059,
"step": 34100
},
{
"epoch": 14.327607875994973,
"grad_norm": 2.7054030895233154,
"learning_rate": 2.6130428711073873e-05,
"loss": 6.431,
"step": 34200
},
{
"epoch": 14.36950146627566,
"grad_norm": 2.653932809829712,
"learning_rate": 2.6060606060606063e-05,
"loss": 6.4035,
"step": 34300
},
{
"epoch": 14.411395056556348,
"grad_norm": 2.5450570583343506,
"learning_rate": 2.5990783410138248e-05,
"loss": 6.417,
"step": 34400
},
{
"epoch": 14.453288646837034,
"grad_norm": 2.9578003883361816,
"learning_rate": 2.592096075967044e-05,
"loss": 6.4087,
"step": 34500
},
{
"epoch": 14.49518223711772,
"grad_norm": 2.9408493041992188,
"learning_rate": 2.5851836335707303e-05,
"loss": 6.3936,
"step": 34600
},
{
"epoch": 14.537075827398407,
"grad_norm": 2.756441116333008,
"learning_rate": 2.5782013685239494e-05,
"loss": 6.404,
"step": 34700
},
{
"epoch": 14.578969417679096,
"grad_norm": 3.685004711151123,
"learning_rate": 2.571219103477168e-05,
"loss": 6.3932,
"step": 34800
},
{
"epoch": 14.620863007959782,
"grad_norm": 2.670825719833374,
"learning_rate": 2.5642368384303872e-05,
"loss": 6.3839,
"step": 34900
},
{
"epoch": 14.662756598240469,
"grad_norm": 3.0986082553863525,
"learning_rate": 2.5572545733836056e-05,
"loss": 6.3782,
"step": 35000
},
{
"epoch": 14.704650188521157,
"grad_norm": 3.003432273864746,
"learning_rate": 2.5502723083368247e-05,
"loss": 6.3775,
"step": 35100
},
{
"epoch": 14.746543778801843,
"grad_norm": 2.752516269683838,
"learning_rate": 2.5432900432900435e-05,
"loss": 6.3731,
"step": 35200
},
{
"epoch": 14.78843736908253,
"grad_norm": 2.7697649002075195,
"learning_rate": 2.536307778243262e-05,
"loss": 6.3701,
"step": 35300
},
{
"epoch": 14.830330959363218,
"grad_norm": 3.0245521068573,
"learning_rate": 2.529325513196481e-05,
"loss": 6.3916,
"step": 35400
},
{
"epoch": 14.872224549643905,
"grad_norm": 3.1849350929260254,
"learning_rate": 2.5223432481497e-05,
"loss": 6.3993,
"step": 35500
},
{
"epoch": 14.914118139924591,
"grad_norm": 3.6655123233795166,
"learning_rate": 2.5153609831029185e-05,
"loss": 6.3791,
"step": 35600
},
{
"epoch": 14.95601173020528,
"grad_norm": 3.2252790927886963,
"learning_rate": 2.5083787180561376e-05,
"loss": 6.3865,
"step": 35700
},
{
"epoch": 14.997905320485966,
"grad_norm": 2.8366169929504395,
"learning_rate": 2.5013964530093564e-05,
"loss": 6.3897,
"step": 35800
},
{
"epoch": 15.039798910766653,
"grad_norm": 2.757725715637207,
"learning_rate": 2.494414187962575e-05,
"loss": 6.376,
"step": 35900
},
{
"epoch": 15.081692501047339,
"grad_norm": 3.1640422344207764,
"learning_rate": 2.4874319229157942e-05,
"loss": 6.3796,
"step": 36000
},
{
"epoch": 15.123586091328027,
"grad_norm": 2.849719285964966,
"learning_rate": 2.480449657869013e-05,
"loss": 6.3765,
"step": 36100
},
{
"epoch": 15.165479681608714,
"grad_norm": 2.7223923206329346,
"learning_rate": 2.4734673928222314e-05,
"loss": 6.3953,
"step": 36200
},
{
"epoch": 15.2073732718894,
"grad_norm": 3.173750162124634,
"learning_rate": 2.4664851277754505e-05,
"loss": 6.3724,
"step": 36300
},
{
"epoch": 15.249266862170089,
"grad_norm": 3.054779529571533,
"learning_rate": 2.4595028627286692e-05,
"loss": 6.3764,
"step": 36400
},
{
"epoch": 15.291160452450775,
"grad_norm": 3.277862071990967,
"learning_rate": 2.4525205976818883e-05,
"loss": 6.3583,
"step": 36500
},
{
"epoch": 15.333054042731462,
"grad_norm": 2.9208297729492188,
"learning_rate": 2.4456081552855748e-05,
"loss": 6.3878,
"step": 36600
},
{
"epoch": 15.37494763301215,
"grad_norm": 2.5356411933898926,
"learning_rate": 2.4386258902387935e-05,
"loss": 6.3705,
"step": 36700
},
{
"epoch": 15.416841223292836,
"grad_norm": 2.8953468799591064,
"learning_rate": 2.4316436251920126e-05,
"loss": 6.3947,
"step": 36800
},
{
"epoch": 15.458734813573523,
"grad_norm": 2.9166266918182373,
"learning_rate": 2.424661360145231e-05,
"loss": 6.3809,
"step": 36900
},
{
"epoch": 15.50062840385421,
"grad_norm": 3.4554710388183594,
"learning_rate": 2.41767909509845e-05,
"loss": 6.3746,
"step": 37000
},
{
"epoch": 15.542521994134898,
"grad_norm": 3.7208077907562256,
"learning_rate": 2.410696830051669e-05,
"loss": 6.3758,
"step": 37100
},
{
"epoch": 15.584415584415584,
"grad_norm": 3.3161842823028564,
"learning_rate": 2.4037145650048876e-05,
"loss": 6.3744,
"step": 37200
},
{
"epoch": 15.62630917469627,
"grad_norm": 2.4062047004699707,
"learning_rate": 2.3967322999581064e-05,
"loss": 6.381,
"step": 37300
},
{
"epoch": 15.668202764976959,
"grad_norm": 3.1894476413726807,
"learning_rate": 2.389750034911325e-05,
"loss": 6.3895,
"step": 37400
},
{
"epoch": 15.710096355257646,
"grad_norm": 2.9203104972839355,
"learning_rate": 2.3827677698645442e-05,
"loss": 6.363,
"step": 37500
},
{
"epoch": 15.751989945538332,
"grad_norm": 3.000694513320923,
"learning_rate": 2.375785504817763e-05,
"loss": 6.3837,
"step": 37600
},
{
"epoch": 15.79388353581902,
"grad_norm": 2.838684558868408,
"learning_rate": 2.368803239770982e-05,
"loss": 6.3859,
"step": 37700
},
{
"epoch": 15.835777126099707,
"grad_norm": 2.648862361907959,
"learning_rate": 2.3618209747242005e-05,
"loss": 6.3411,
"step": 37800
},
{
"epoch": 15.877670716380393,
"grad_norm": 3.5438232421875,
"learning_rate": 2.3548387096774193e-05,
"loss": 6.3627,
"step": 37900
},
{
"epoch": 15.91956430666108,
"grad_norm": 2.8182501792907715,
"learning_rate": 2.3478564446306384e-05,
"loss": 6.3731,
"step": 38000
},
{
"epoch": 15.961457896941768,
"grad_norm": 3.3253772258758545,
"learning_rate": 2.340874179583857e-05,
"loss": 6.396,
"step": 38100
},
{
"epoch": 16.003351487222456,
"grad_norm": 3.668926954269409,
"learning_rate": 2.3338919145370762e-05,
"loss": 6.3437,
"step": 38200
},
{
"epoch": 16.045245077503143,
"grad_norm": 3.028989315032959,
"learning_rate": 2.3269096494902946e-05,
"loss": 6.3837,
"step": 38300
},
{
"epoch": 16.08713866778383,
"grad_norm": 3.220702648162842,
"learning_rate": 2.3199273844435134e-05,
"loss": 6.3609,
"step": 38400
},
{
"epoch": 16.129032258064516,
"grad_norm": 3.1788036823272705,
"learning_rate": 2.3129451193967325e-05,
"loss": 6.3723,
"step": 38500
},
{
"epoch": 16.170925848345203,
"grad_norm": 3.351151466369629,
"learning_rate": 2.306032677000419e-05,
"loss": 6.3731,
"step": 38600
},
{
"epoch": 16.21281943862589,
"grad_norm": 2.933992862701416,
"learning_rate": 2.299050411953638e-05,
"loss": 6.3654,
"step": 38700
},
{
"epoch": 16.25471302890658,
"grad_norm": 4.2123589515686035,
"learning_rate": 2.2920681469068568e-05,
"loss": 6.3364,
"step": 38800
},
{
"epoch": 16.296606619187266,
"grad_norm": 2.9287397861480713,
"learning_rate": 2.2850858818600755e-05,
"loss": 6.3643,
"step": 38900
},
{
"epoch": 16.338500209467952,
"grad_norm": 2.6518173217773438,
"learning_rate": 2.2781036168132943e-05,
"loss": 6.3538,
"step": 39000
},
{
"epoch": 16.38039379974864,
"grad_norm": 3.490497589111328,
"learning_rate": 2.271121351766513e-05,
"loss": 6.365,
"step": 39100
},
{
"epoch": 16.422287390029325,
"grad_norm": 3.090874195098877,
"learning_rate": 2.264139086719732e-05,
"loss": 6.3513,
"step": 39200
},
{
"epoch": 16.46418098031001,
"grad_norm": 2.793083429336548,
"learning_rate": 2.257156821672951e-05,
"loss": 6.3815,
"step": 39300
},
{
"epoch": 16.506074570590698,
"grad_norm": 2.656334638595581,
"learning_rate": 2.2501745566261696e-05,
"loss": 6.3677,
"step": 39400
},
{
"epoch": 16.547968160871388,
"grad_norm": 2.950857162475586,
"learning_rate": 2.2431922915793884e-05,
"loss": 6.3601,
"step": 39500
},
{
"epoch": 16.589861751152075,
"grad_norm": 2.948397636413574,
"learning_rate": 2.236210026532607e-05,
"loss": 6.3633,
"step": 39600
},
{
"epoch": 16.63175534143276,
"grad_norm": 3.759934902191162,
"learning_rate": 2.2292277614858262e-05,
"loss": 6.3664,
"step": 39700
},
{
"epoch": 16.673648931713448,
"grad_norm": 2.6607794761657715,
"learning_rate": 2.222245496439045e-05,
"loss": 6.3659,
"step": 39800
},
{
"epoch": 16.715542521994134,
"grad_norm": 3.2569267749786377,
"learning_rate": 2.2152632313922638e-05,
"loss": 6.3477,
"step": 39900
},
{
"epoch": 16.75743611227482,
"grad_norm": 3.1701977252960205,
"learning_rate": 2.2082809663454825e-05,
"loss": 6.3466,
"step": 40000
},
{
"epoch": 16.79932970255551,
"grad_norm": 2.8855369091033936,
"learning_rate": 2.2012987012987013e-05,
"loss": 6.3774,
"step": 40100
},
{
"epoch": 16.841223292836197,
"grad_norm": 2.8468215465545654,
"learning_rate": 2.1943164362519204e-05,
"loss": 6.3388,
"step": 40200
},
{
"epoch": 16.883116883116884,
"grad_norm": 3.3314404487609863,
"learning_rate": 2.187334171205139e-05,
"loss": 6.3658,
"step": 40300
},
{
"epoch": 16.92501047339757,
"grad_norm": 3.023106336593628,
"learning_rate": 2.180351906158358e-05,
"loss": 6.3443,
"step": 40400
},
{
"epoch": 16.966904063678257,
"grad_norm": 3.2845230102539062,
"learning_rate": 2.1733696411115766e-05,
"loss": 6.3785,
"step": 40500
},
{
"epoch": 17.008797653958943,
"grad_norm": 2.805790424346924,
"learning_rate": 2.166457198715263e-05,
"loss": 6.3792,
"step": 40600
},
{
"epoch": 17.05069124423963,
"grad_norm": 2.893737554550171,
"learning_rate": 2.159474933668482e-05,
"loss": 6.3138,
"step": 40700
},
{
"epoch": 17.09258483452032,
"grad_norm": 3.238863945007324,
"learning_rate": 2.1525624912721686e-05,
"loss": 6.3686,
"step": 40800
},
{
"epoch": 17.134478424801006,
"grad_norm": 3.403582811355591,
"learning_rate": 2.1455802262253877e-05,
"loss": 6.3312,
"step": 40900
},
{
"epoch": 17.176372015081693,
"grad_norm": 2.963287353515625,
"learning_rate": 2.1385979611786064e-05,
"loss": 6.3515,
"step": 41000
},
{
"epoch": 17.21826560536238,
"grad_norm": 3.867340087890625,
"learning_rate": 2.1316156961318255e-05,
"loss": 6.3566,
"step": 41100
},
{
"epoch": 17.260159195643066,
"grad_norm": 2.841190814971924,
"learning_rate": 2.124633431085044e-05,
"loss": 6.3308,
"step": 41200
},
{
"epoch": 17.302052785923753,
"grad_norm": 2.872523307800293,
"learning_rate": 2.1176511660382627e-05,
"loss": 6.3433,
"step": 41300
},
{
"epoch": 17.34394637620444,
"grad_norm": 3.156465530395508,
"learning_rate": 2.1106689009914818e-05,
"loss": 6.3779,
"step": 41400
},
{
"epoch": 17.38583996648513,
"grad_norm": 3.5904667377471924,
"learning_rate": 2.1036866359447005e-05,
"loss": 6.3402,
"step": 41500
},
{
"epoch": 17.427733556765816,
"grad_norm": 3.5753939151763916,
"learning_rate": 2.0967043708979196e-05,
"loss": 6.3572,
"step": 41600
},
{
"epoch": 17.469627147046502,
"grad_norm": 3.129514217376709,
"learning_rate": 2.089722105851138e-05,
"loss": 6.3302,
"step": 41700
},
{
"epoch": 17.51152073732719,
"grad_norm": 2.988732099533081,
"learning_rate": 2.0827398408043568e-05,
"loss": 6.3807,
"step": 41800
},
{
"epoch": 17.553414327607875,
"grad_norm": 2.857875108718872,
"learning_rate": 2.075757575757576e-05,
"loss": 6.3519,
"step": 41900
},
{
"epoch": 17.59530791788856,
"grad_norm": 4.023842811584473,
"learning_rate": 2.0687753107107947e-05,
"loss": 6.3467,
"step": 42000
},
{
"epoch": 17.63720150816925,
"grad_norm": 3.049686908721924,
"learning_rate": 2.0617930456640137e-05,
"loss": 6.3306,
"step": 42100
},
{
"epoch": 17.679095098449938,
"grad_norm": 3.3211073875427246,
"learning_rate": 2.054810780617232e-05,
"loss": 6.3611,
"step": 42200
},
{
"epoch": 17.720988688730625,
"grad_norm": 3.064138174057007,
"learning_rate": 2.047828515570451e-05,
"loss": 6.3217,
"step": 42300
},
{
"epoch": 17.76288227901131,
"grad_norm": 2.7812724113464355,
"learning_rate": 2.04084625052367e-05,
"loss": 6.3131,
"step": 42400
},
{
"epoch": 17.804775869291998,
"grad_norm": 2.5516164302825928,
"learning_rate": 2.0338639854768888e-05,
"loss": 6.3428,
"step": 42500
},
{
"epoch": 17.846669459572684,
"grad_norm": 2.9599711894989014,
"learning_rate": 2.026881720430108e-05,
"loss": 6.3545,
"step": 42600
},
{
"epoch": 17.88856304985337,
"grad_norm": 2.8674137592315674,
"learning_rate": 2.0198994553833263e-05,
"loss": 6.3302,
"step": 42700
},
{
"epoch": 17.93045664013406,
"grad_norm": 3.3227078914642334,
"learning_rate": 2.012917190336545e-05,
"loss": 6.3278,
"step": 42800
},
{
"epoch": 17.972350230414747,
"grad_norm": 3.080399751663208,
"learning_rate": 2.005934925289764e-05,
"loss": 6.3206,
"step": 42900
},
{
"epoch": 18.014243820695434,
"grad_norm": 4.004719257354736,
"learning_rate": 1.998952660242983e-05,
"loss": 6.3407,
"step": 43000
},
{
"epoch": 18.05613741097612,
"grad_norm": 2.8186423778533936,
"learning_rate": 1.991970395196202e-05,
"loss": 6.3136,
"step": 43100
},
{
"epoch": 18.098031001256807,
"grad_norm": 2.81748104095459,
"learning_rate": 1.9849881301494204e-05,
"loss": 6.3353,
"step": 43200
},
{
"epoch": 18.139924591537493,
"grad_norm": 2.9991416931152344,
"learning_rate": 1.9780058651026395e-05,
"loss": 6.3194,
"step": 43300
},
{
"epoch": 18.181818181818183,
"grad_norm": 3.4876794815063477,
"learning_rate": 1.9710236000558583e-05,
"loss": 6.3293,
"step": 43400
},
{
"epoch": 18.22371177209887,
"grad_norm": 3.0756711959838867,
"learning_rate": 1.964041335009077e-05,
"loss": 6.341,
"step": 43500
},
{
"epoch": 18.265605362379556,
"grad_norm": 3.171670436859131,
"learning_rate": 1.9570590699622958e-05,
"loss": 6.3075,
"step": 43600
},
{
"epoch": 18.307498952660243,
"grad_norm": 3.3317439556121826,
"learning_rate": 1.9500768049155145e-05,
"loss": 6.3436,
"step": 43700
},
{
"epoch": 18.34939254294093,
"grad_norm": 2.924349308013916,
"learning_rate": 1.9430945398687336e-05,
"loss": 6.3217,
"step": 43800
},
{
"epoch": 18.391286133221616,
"grad_norm": 3.247955560684204,
"learning_rate": 1.9361122748219524e-05,
"loss": 6.3324,
"step": 43900
},
{
"epoch": 18.433179723502302,
"grad_norm": 3.340263843536377,
"learning_rate": 1.929130009775171e-05,
"loss": 6.2993,
"step": 44000
},
{
"epoch": 18.475073313782993,
"grad_norm": 2.973019599914551,
"learning_rate": 1.92214774472839e-05,
"loss": 6.3292,
"step": 44100
},
{
"epoch": 18.51696690406368,
"grad_norm": 3.5055582523345947,
"learning_rate": 1.9151654796816086e-05,
"loss": 6.3175,
"step": 44200
},
{
"epoch": 18.558860494344366,
"grad_norm": 2.9543776512145996,
"learning_rate": 1.9081832146348277e-05,
"loss": 6.3206,
"step": 44300
},
{
"epoch": 18.600754084625052,
"grad_norm": 2.790940284729004,
"learning_rate": 1.9012009495880465e-05,
"loss": 6.3383,
"step": 44400
},
{
"epoch": 18.64264767490574,
"grad_norm": 3.419908285140991,
"learning_rate": 1.8942186845412653e-05,
"loss": 6.3329,
"step": 44500
},
{
"epoch": 18.684541265186425,
"grad_norm": 3.3396215438842773,
"learning_rate": 1.887236419494484e-05,
"loss": 6.312,
"step": 44600
},
{
"epoch": 18.726434855467115,
"grad_norm": 2.6713643074035645,
"learning_rate": 1.8802541544477028e-05,
"loss": 6.315,
"step": 44700
},
{
"epoch": 18.7683284457478,
"grad_norm": 3.2764880657196045,
"learning_rate": 1.8733417120513895e-05,
"loss": 6.3311,
"step": 44800
},
{
"epoch": 18.810222036028488,
"grad_norm": 3.602581739425659,
"learning_rate": 1.8663594470046083e-05,
"loss": 6.327,
"step": 44900
},
{
"epoch": 18.852115626309175,
"grad_norm": 3.052971124649048,
"learning_rate": 1.8593771819578274e-05,
"loss": 6.2911,
"step": 45000
},
{
"epoch": 18.89400921658986,
"grad_norm": 3.0912699699401855,
"learning_rate": 1.852394916911046e-05,
"loss": 6.3057,
"step": 45100
},
{
"epoch": 18.935902806870548,
"grad_norm": 2.631545305252075,
"learning_rate": 1.845412651864265e-05,
"loss": 6.3381,
"step": 45200
},
{
"epoch": 18.977796397151234,
"grad_norm": 3.8213324546813965,
"learning_rate": 1.8384303868174836e-05,
"loss": 6.3123,
"step": 45300
},
{
"epoch": 19.019689987431924,
"grad_norm": 3.3717353343963623,
"learning_rate": 1.8314481217707024e-05,
"loss": 6.3194,
"step": 45400
},
{
"epoch": 19.06158357771261,
"grad_norm": 2.831409215927124,
"learning_rate": 1.8244658567239215e-05,
"loss": 6.3383,
"step": 45500
},
{
"epoch": 19.103477167993297,
"grad_norm": 2.915093183517456,
"learning_rate": 1.8174835916771403e-05,
"loss": 6.3208,
"step": 45600
},
{
"epoch": 19.145370758273984,
"grad_norm": 3.1236917972564697,
"learning_rate": 1.810501326630359e-05,
"loss": 6.3089,
"step": 45700
},
{
"epoch": 19.18726434855467,
"grad_norm": 3.2876298427581787,
"learning_rate": 1.8035190615835778e-05,
"loss": 6.2975,
"step": 45800
},
{
"epoch": 19.229157938835357,
"grad_norm": 2.6437103748321533,
"learning_rate": 1.7965367965367965e-05,
"loss": 6.3341,
"step": 45900
},
{
"epoch": 19.271051529116047,
"grad_norm": 2.9252028465270996,
"learning_rate": 1.7895545314900156e-05,
"loss": 6.3404,
"step": 46000
},
{
"epoch": 19.312945119396733,
"grad_norm": 3.4250340461730957,
"learning_rate": 1.7825722664432344e-05,
"loss": 6.3072,
"step": 46100
},
{
"epoch": 19.35483870967742,
"grad_norm": 3.1287946701049805,
"learning_rate": 1.775590001396453e-05,
"loss": 6.3022,
"step": 46200
},
{
"epoch": 19.396732299958106,
"grad_norm": 3.4577419757843018,
"learning_rate": 1.76867755900014e-05,
"loss": 6.2938,
"step": 46300
},
{
"epoch": 19.438625890238793,
"grad_norm": 3.7131240367889404,
"learning_rate": 1.7616952939533586e-05,
"loss": 6.3088,
"step": 46400
},
{
"epoch": 19.48051948051948,
"grad_norm": 3.6799802780151367,
"learning_rate": 1.7547130289065774e-05,
"loss": 6.3326,
"step": 46500
},
{
"epoch": 19.522413070800166,
"grad_norm": 2.834351062774658,
"learning_rate": 1.747730763859796e-05,
"loss": 6.2952,
"step": 46600
},
{
"epoch": 19.564306661080856,
"grad_norm": 3.0629451274871826,
"learning_rate": 1.7407484988130152e-05,
"loss": 6.3185,
"step": 46700
},
{
"epoch": 19.606200251361543,
"grad_norm": 3.4801712036132812,
"learning_rate": 1.733766233766234e-05,
"loss": 6.3003,
"step": 46800
},
{
"epoch": 19.64809384164223,
"grad_norm": 2.8250389099121094,
"learning_rate": 1.7267839687194524e-05,
"loss": 6.3033,
"step": 46900
},
{
"epoch": 19.689987431922916,
"grad_norm": 3.5964672565460205,
"learning_rate": 1.7198017036726715e-05,
"loss": 6.293,
"step": 47000
},
{
"epoch": 19.731881022203602,
"grad_norm": 2.7947146892547607,
"learning_rate": 1.7128194386258903e-05,
"loss": 6.2884,
"step": 47100
},
{
"epoch": 19.77377461248429,
"grad_norm": 3.0473551750183105,
"learning_rate": 1.7058371735791094e-05,
"loss": 6.312,
"step": 47200
},
{
"epoch": 19.81566820276498,
"grad_norm": 3.1810736656188965,
"learning_rate": 1.698854908532328e-05,
"loss": 6.3102,
"step": 47300
},
{
"epoch": 19.857561793045665,
"grad_norm": 3.0046746730804443,
"learning_rate": 1.6918726434855465e-05,
"loss": 6.3115,
"step": 47400
},
{
"epoch": 19.89945538332635,
"grad_norm": 2.6985220909118652,
"learning_rate": 1.6848903784387656e-05,
"loss": 6.3132,
"step": 47500
},
{
"epoch": 19.941348973607038,
"grad_norm": 2.958906650543213,
"learning_rate": 1.6779081133919844e-05,
"loss": 6.3024,
"step": 47600
},
{
"epoch": 19.983242563887725,
"grad_norm": 3.5484089851379395,
"learning_rate": 1.6709258483452035e-05,
"loss": 6.2989,
"step": 47700
},
{
"epoch": 20.02513615416841,
"grad_norm": 4.328272342681885,
"learning_rate": 1.6639435832984222e-05,
"loss": 6.3162,
"step": 47800
},
{
"epoch": 20.067029744449098,
"grad_norm": 3.0396926403045654,
"learning_rate": 1.6569613182516407e-05,
"loss": 6.3004,
"step": 47900
},
{
"epoch": 20.108923334729788,
"grad_norm": 3.328972339630127,
"learning_rate": 1.6499790532048598e-05,
"loss": 6.2855,
"step": 48000
},
{
"epoch": 20.150816925010474,
"grad_norm": 3.301114320755005,
"learning_rate": 1.6429967881580785e-05,
"loss": 6.2874,
"step": 48100
},
{
"epoch": 20.19271051529116,
"grad_norm": 3.297041177749634,
"learning_rate": 1.6360145231112976e-05,
"loss": 6.3089,
"step": 48200
},
{
"epoch": 20.234604105571847,
"grad_norm": 2.9122605323791504,
"learning_rate": 1.6290322580645164e-05,
"loss": 6.3157,
"step": 48300
},
{
"epoch": 20.276497695852534,
"grad_norm": 2.8182084560394287,
"learning_rate": 1.6220499930177348e-05,
"loss": 6.3118,
"step": 48400
},
{
"epoch": 20.31839128613322,
"grad_norm": 3.8560192584991455,
"learning_rate": 1.615067727970954e-05,
"loss": 6.2858,
"step": 48500
},
{
"epoch": 20.36028487641391,
"grad_norm": 2.457240581512451,
"learning_rate": 1.6080854629241726e-05,
"loss": 6.3077,
"step": 48600
},
{
"epoch": 20.402178466694597,
"grad_norm": 3.5376362800598145,
"learning_rate": 1.6011031978773917e-05,
"loss": 6.2892,
"step": 48700
},
{
"epoch": 20.444072056975283,
"grad_norm": 3.3489222526550293,
"learning_rate": 1.59412093283061e-05,
"loss": 6.2973,
"step": 48800
},
{
"epoch": 20.48596564725597,
"grad_norm": 3.600166082382202,
"learning_rate": 1.587138667783829e-05,
"loss": 6.31,
"step": 48900
},
{
"epoch": 20.527859237536656,
"grad_norm": 3.255598783493042,
"learning_rate": 1.580156402737048e-05,
"loss": 6.2389,
"step": 49000
},
{
"epoch": 20.569752827817343,
"grad_norm": 3.166994094848633,
"learning_rate": 1.5731741376902668e-05,
"loss": 6.303,
"step": 49100
},
{
"epoch": 20.61164641809803,
"grad_norm": 3.615269184112549,
"learning_rate": 1.566191872643486e-05,
"loss": 6.281,
"step": 49200
},
{
"epoch": 20.65354000837872,
"grad_norm": 3.1495063304901123,
"learning_rate": 1.5592096075967043e-05,
"loss": 6.2666,
"step": 49300
},
{
"epoch": 20.695433598659406,
"grad_norm": 2.9170730113983154,
"learning_rate": 1.552227342549923e-05,
"loss": 6.2738,
"step": 49400
},
{
"epoch": 20.737327188940093,
"grad_norm": 3.0922224521636963,
"learning_rate": 1.545245077503142e-05,
"loss": 6.2805,
"step": 49500
},
{
"epoch": 20.77922077922078,
"grad_norm": 3.088012933731079,
"learning_rate": 1.538262812456361e-05,
"loss": 6.2906,
"step": 49600
},
{
"epoch": 20.821114369501466,
"grad_norm": 2.939486503601074,
"learning_rate": 1.53128054740958e-05,
"loss": 6.2636,
"step": 49700
},
{
"epoch": 20.863007959782152,
"grad_norm": 3.597949743270874,
"learning_rate": 1.5242982823627986e-05,
"loss": 6.2745,
"step": 49800
},
{
"epoch": 20.90490155006284,
"grad_norm": 3.4760777950286865,
"learning_rate": 1.5173160173160175e-05,
"loss": 6.2702,
"step": 49900
},
{
"epoch": 20.94679514034353,
"grad_norm": 3.04856014251709,
"learning_rate": 1.5103337522692362e-05,
"loss": 6.2841,
"step": 50000
},
{
"epoch": 20.988688730624215,
"grad_norm": 2.849895477294922,
"learning_rate": 1.503351487222455e-05,
"loss": 6.2814,
"step": 50100
},
{
"epoch": 21.0305823209049,
"grad_norm": 3.1246280670166016,
"learning_rate": 1.496369222175674e-05,
"loss": 6.2754,
"step": 50200
},
{
"epoch": 21.072475911185588,
"grad_norm": 3.303846836090088,
"learning_rate": 1.4894567797793605e-05,
"loss": 6.2661,
"step": 50300
},
{
"epoch": 21.114369501466275,
"grad_norm": 3.5818755626678467,
"learning_rate": 1.4824745147325794e-05,
"loss": 6.2804,
"step": 50400
},
{
"epoch": 21.15626309174696,
"grad_norm": 3.0695786476135254,
"learning_rate": 1.4754922496857982e-05,
"loss": 6.284,
"step": 50500
},
{
"epoch": 21.19815668202765,
"grad_norm": 3.6067614555358887,
"learning_rate": 1.4685099846390168e-05,
"loss": 6.2863,
"step": 50600
},
{
"epoch": 21.240050272308338,
"grad_norm": 3.2230417728424072,
"learning_rate": 1.4615277195922359e-05,
"loss": 6.287,
"step": 50700
},
{
"epoch": 21.281943862589024,
"grad_norm": 3.059466600418091,
"learning_rate": 1.4545454545454545e-05,
"loss": 6.2442,
"step": 50800
},
{
"epoch": 21.32383745286971,
"grad_norm": 3.7770040035247803,
"learning_rate": 1.4475631894986736e-05,
"loss": 6.2612,
"step": 50900
},
{
"epoch": 21.365731043150397,
"grad_norm": 3.3269879817962646,
"learning_rate": 1.4405809244518923e-05,
"loss": 6.2985,
"step": 51000
},
{
"epoch": 21.407624633431084,
"grad_norm": 2.649940252304077,
"learning_rate": 1.4335986594051109e-05,
"loss": 6.2343,
"step": 51100
},
{
"epoch": 21.44951822371177,
"grad_norm": 3.4042983055114746,
"learning_rate": 1.42661639435833e-05,
"loss": 6.2701,
"step": 51200
},
{
"epoch": 21.49141181399246,
"grad_norm": 3.1958000659942627,
"learning_rate": 1.4196341293115486e-05,
"loss": 6.2866,
"step": 51300
},
{
"epoch": 21.533305404273147,
"grad_norm": 3.6010313034057617,
"learning_rate": 1.4126518642647677e-05,
"loss": 6.2683,
"step": 51400
},
{
"epoch": 21.575198994553833,
"grad_norm": 3.429414749145508,
"learning_rate": 1.4056695992179864e-05,
"loss": 6.2408,
"step": 51500
},
{
"epoch": 21.61709258483452,
"grad_norm": 3.069561004638672,
"learning_rate": 1.3986873341712054e-05,
"loss": 6.2641,
"step": 51600
},
{
"epoch": 21.658986175115206,
"grad_norm": 3.575247287750244,
"learning_rate": 1.3917050691244241e-05,
"loss": 6.2722,
"step": 51700
},
{
"epoch": 21.700879765395893,
"grad_norm": 3.033505439758301,
"learning_rate": 1.3847228040776427e-05,
"loss": 6.2424,
"step": 51800
},
{
"epoch": 21.742773355676583,
"grad_norm": 3.287740707397461,
"learning_rate": 1.3777405390308618e-05,
"loss": 6.2516,
"step": 51900
},
{
"epoch": 21.78466694595727,
"grad_norm": 3.0363028049468994,
"learning_rate": 1.3707582739840804e-05,
"loss": 6.2641,
"step": 52000
},
{
"epoch": 21.826560536237956,
"grad_norm": 3.1549689769744873,
"learning_rate": 1.3637760089372995e-05,
"loss": 6.2335,
"step": 52100
},
{
"epoch": 21.868454126518643,
"grad_norm": 3.8512282371520996,
"learning_rate": 1.3567937438905182e-05,
"loss": 6.2729,
"step": 52200
},
{
"epoch": 21.91034771679933,
"grad_norm": 4.0751824378967285,
"learning_rate": 1.3498813014942047e-05,
"loss": 6.2397,
"step": 52300
},
{
"epoch": 21.952241307080016,
"grad_norm": 3.375235080718994,
"learning_rate": 1.3428990364474236e-05,
"loss": 6.2316,
"step": 52400
},
{
"epoch": 21.994134897360702,
"grad_norm": 3.093156337738037,
"learning_rate": 1.3359167714006423e-05,
"loss": 6.2468,
"step": 52500
},
{
"epoch": 22.036028487641392,
"grad_norm": 3.729182243347168,
"learning_rate": 1.3289345063538614e-05,
"loss": 6.2366,
"step": 52600
},
{
"epoch": 22.07792207792208,
"grad_norm": 3.4075732231140137,
"learning_rate": 1.32195224130708e-05,
"loss": 6.2693,
"step": 52700
},
{
"epoch": 22.119815668202765,
"grad_norm": 2.9553005695343018,
"learning_rate": 1.3149699762602988e-05,
"loss": 6.2592,
"step": 52800
},
{
"epoch": 22.16170925848345,
"grad_norm": 3.094538688659668,
"learning_rate": 1.3079877112135177e-05,
"loss": 6.26,
"step": 52900
},
{
"epoch": 22.203602848764138,
"grad_norm": 3.907914161682129,
"learning_rate": 1.3010054461667365e-05,
"loss": 6.2711,
"step": 53000
},
{
"epoch": 22.245496439044825,
"grad_norm": 3.7182159423828125,
"learning_rate": 1.2940231811199554e-05,
"loss": 6.2713,
"step": 53100
},
{
"epoch": 22.287390029325515,
"grad_norm": 2.8652303218841553,
"learning_rate": 1.2870409160731741e-05,
"loss": 6.2325,
"step": 53200
},
{
"epoch": 22.3292836196062,
"grad_norm": 3.190359592437744,
"learning_rate": 1.2800586510263929e-05,
"loss": 6.2563,
"step": 53300
},
{
"epoch": 22.371177209886888,
"grad_norm": 3.372394561767578,
"learning_rate": 1.2730763859796118e-05,
"loss": 6.2489,
"step": 53400
},
{
"epoch": 22.413070800167574,
"grad_norm": 3.340397596359253,
"learning_rate": 1.2660941209328306e-05,
"loss": 6.2147,
"step": 53500
},
{
"epoch": 22.45496439044826,
"grad_norm": 3.1127400398254395,
"learning_rate": 1.2591118558860495e-05,
"loss": 6.2588,
"step": 53600
},
{
"epoch": 22.496857980728947,
"grad_norm": 4.315746307373047,
"learning_rate": 1.2521295908392683e-05,
"loss": 6.2641,
"step": 53700
},
{
"epoch": 22.538751571009634,
"grad_norm": 3.204827070236206,
"learning_rate": 1.2451473257924872e-05,
"loss": 6.2506,
"step": 53800
},
{
"epoch": 22.580645161290324,
"grad_norm": 3.653074026107788,
"learning_rate": 1.238165060745706e-05,
"loss": 6.2512,
"step": 53900
},
{
"epoch": 22.62253875157101,
"grad_norm": 3.8693697452545166,
"learning_rate": 1.2311827956989249e-05,
"loss": 6.2515,
"step": 54000
},
{
"epoch": 22.664432341851697,
"grad_norm": 3.9418985843658447,
"learning_rate": 1.2242005306521436e-05,
"loss": 6.2522,
"step": 54100
},
{
"epoch": 22.706325932132383,
"grad_norm": 3.328951358795166,
"learning_rate": 1.2172182656053624e-05,
"loss": 6.2244,
"step": 54200
},
{
"epoch": 22.74821952241307,
"grad_norm": 3.251552104949951,
"learning_rate": 1.210305823209049e-05,
"loss": 6.2413,
"step": 54300
},
{
"epoch": 22.790113112693756,
"grad_norm": 3.0756313800811768,
"learning_rate": 1.2033235581622679e-05,
"loss": 6.2343,
"step": 54400
},
{
"epoch": 22.832006702974446,
"grad_norm": 3.174830913543701,
"learning_rate": 1.1963412931154867e-05,
"loss": 6.2445,
"step": 54500
},
{
"epoch": 22.873900293255133,
"grad_norm": 2.831454038619995,
"learning_rate": 1.1893590280687056e-05,
"loss": 6.2457,
"step": 54600
},
{
"epoch": 22.91579388353582,
"grad_norm": 3.3783247470855713,
"learning_rate": 1.1823767630219245e-05,
"loss": 6.2202,
"step": 54700
},
{
"epoch": 22.957687473816506,
"grad_norm": 3.4505226612091064,
"learning_rate": 1.1753944979751431e-05,
"loss": 6.2329,
"step": 54800
},
{
"epoch": 22.999581064097192,
"grad_norm": 4.203530311584473,
"learning_rate": 1.168412232928362e-05,
"loss": 6.2464,
"step": 54900
},
{
"epoch": 23.04147465437788,
"grad_norm": 3.295198678970337,
"learning_rate": 1.1614299678815808e-05,
"loss": 6.2163,
"step": 55000
},
{
"epoch": 23.083368244658566,
"grad_norm": 3.6795082092285156,
"learning_rate": 1.1544477028347997e-05,
"loss": 6.2108,
"step": 55100
},
{
"epoch": 23.125261834939256,
"grad_norm": 3.7577404975891113,
"learning_rate": 1.1474654377880186e-05,
"loss": 6.2406,
"step": 55200
},
{
"epoch": 23.167155425219942,
"grad_norm": 4.524641036987305,
"learning_rate": 1.1404831727412372e-05,
"loss": 6.2449,
"step": 55300
},
{
"epoch": 23.20904901550063,
"grad_norm": 3.3049490451812744,
"learning_rate": 1.1335009076944561e-05,
"loss": 6.202,
"step": 55400
},
{
"epoch": 23.250942605781315,
"grad_norm": 3.6244115829467773,
"learning_rate": 1.1265186426476749e-05,
"loss": 6.2214,
"step": 55500
},
{
"epoch": 23.292836196062,
"grad_norm": 3.1158556938171387,
"learning_rate": 1.1195363776008938e-05,
"loss": 6.2247,
"step": 55600
},
{
"epoch": 23.334729786342688,
"grad_norm": 3.208771228790283,
"learning_rate": 1.1125541125541126e-05,
"loss": 6.2416,
"step": 55700
},
{
"epoch": 23.376623376623378,
"grad_norm": 4.181106090545654,
"learning_rate": 1.1055718475073313e-05,
"loss": 6.2343,
"step": 55800
},
{
"epoch": 23.418516966904065,
"grad_norm": 2.8972866535186768,
"learning_rate": 1.0985895824605503e-05,
"loss": 6.2186,
"step": 55900
},
{
"epoch": 23.46041055718475,
"grad_norm": 3.1691384315490723,
"learning_rate": 1.091607317413769e-05,
"loss": 6.2328,
"step": 56000
},
{
"epoch": 23.502304147465438,
"grad_norm": 3.214346408843994,
"learning_rate": 1.084625052366988e-05,
"loss": 6.2356,
"step": 56100
},
{
"epoch": 23.544197737746124,
"grad_norm": 3.0547690391540527,
"learning_rate": 1.0776427873202067e-05,
"loss": 6.2245,
"step": 56200
},
{
"epoch": 23.58609132802681,
"grad_norm": 3.6090760231018066,
"learning_rate": 1.0707303449238935e-05,
"loss": 6.2634,
"step": 56300
},
{
"epoch": 23.627984918307497,
"grad_norm": 3.210068702697754,
"learning_rate": 1.0637480798771122e-05,
"loss": 6.2126,
"step": 56400
},
{
"epoch": 23.669878508588187,
"grad_norm": 3.872507095336914,
"learning_rate": 1.056765814830331e-05,
"loss": 6.2286,
"step": 56500
},
{
"epoch": 23.711772098868874,
"grad_norm": 4.503695011138916,
"learning_rate": 1.0497835497835499e-05,
"loss": 6.2156,
"step": 56600
},
{
"epoch": 23.75366568914956,
"grad_norm": 3.963315486907959,
"learning_rate": 1.0428012847367686e-05,
"loss": 6.2247,
"step": 56700
},
{
"epoch": 23.795559279430247,
"grad_norm": 3.4394917488098145,
"learning_rate": 1.0358190196899876e-05,
"loss": 6.234,
"step": 56800
},
{
"epoch": 23.837452869710933,
"grad_norm": 3.403167724609375,
"learning_rate": 1.0288367546432063e-05,
"loss": 6.2045,
"step": 56900
},
{
"epoch": 23.87934645999162,
"grad_norm": 2.8274378776550293,
"learning_rate": 1.0218544895964251e-05,
"loss": 6.2121,
"step": 57000
},
{
"epoch": 23.92124005027231,
"grad_norm": 3.277188301086426,
"learning_rate": 1.0148722245496438e-05,
"loss": 6.222,
"step": 57100
},
{
"epoch": 23.963133640552996,
"grad_norm": 3.0735063552856445,
"learning_rate": 1.0078899595028628e-05,
"loss": 6.2257,
"step": 57200
},
{
"epoch": 24.005027230833683,
"grad_norm": 3.6680026054382324,
"learning_rate": 1.0009076944560817e-05,
"loss": 6.2131,
"step": 57300
},
{
"epoch": 24.04692082111437,
"grad_norm": 3.134713888168335,
"learning_rate": 9.939254294093005e-06,
"loss": 6.2241,
"step": 57400
},
{
"epoch": 24.088814411395056,
"grad_norm": 2.9466712474823,
"learning_rate": 9.869431643625192e-06,
"loss": 6.2158,
"step": 57500
},
{
"epoch": 24.130708001675742,
"grad_norm": 3.468949794769287,
"learning_rate": 9.79960899315738e-06,
"loss": 6.1793,
"step": 57600
},
{
"epoch": 24.17260159195643,
"grad_norm": 3.5487060546875,
"learning_rate": 9.729786342689569e-06,
"loss": 6.2218,
"step": 57700
},
{
"epoch": 24.21449518223712,
"grad_norm": 4.345893383026123,
"learning_rate": 9.659963692221758e-06,
"loss": 6.2023,
"step": 57800
},
{
"epoch": 24.256388772517806,
"grad_norm": 2.9016401767730713,
"learning_rate": 9.590141041753946e-06,
"loss": 6.23,
"step": 57900
},
{
"epoch": 24.298282362798492,
"grad_norm": 4.17023229598999,
"learning_rate": 9.520318391286135e-06,
"loss": 6.2114,
"step": 58000
},
{
"epoch": 24.34017595307918,
"grad_norm": 3.322115421295166,
"learning_rate": 9.45049574081832e-06,
"loss": 6.204,
"step": 58100
},
{
"epoch": 24.382069543359865,
"grad_norm": 3.709805488586426,
"learning_rate": 9.38067309035051e-06,
"loss": 6.2087,
"step": 58200
},
{
"epoch": 24.42396313364055,
"grad_norm": 3.225588798522949,
"learning_rate": 9.311548666387376e-06,
"loss": 6.2436,
"step": 58300
},
{
"epoch": 24.46585672392124,
"grad_norm": 3.1229472160339355,
"learning_rate": 9.241726015919565e-06,
"loss": 6.2253,
"step": 58400
},
{
"epoch": 24.507750314201928,
"grad_norm": 3.4445230960845947,
"learning_rate": 9.171903365451753e-06,
"loss": 6.2254,
"step": 58500
},
{
"epoch": 24.549643904482615,
"grad_norm": 4.2796807289123535,
"learning_rate": 9.102080714983942e-06,
"loss": 6.221,
"step": 58600
},
{
"epoch": 24.5915374947633,
"grad_norm": 3.2323966026306152,
"learning_rate": 9.03225806451613e-06,
"loss": 6.228,
"step": 58700
},
{
"epoch": 24.633431085043988,
"grad_norm": 4.064596652984619,
"learning_rate": 8.962435414048317e-06,
"loss": 6.2363,
"step": 58800
},
{
"epoch": 24.675324675324674,
"grad_norm": 3.068544864654541,
"learning_rate": 8.893310990085183e-06,
"loss": 6.2508,
"step": 58900
},
{
"epoch": 24.71721826560536,
"grad_norm": 2.6201155185699463,
"learning_rate": 8.823488339617372e-06,
"loss": 6.2193,
"step": 59000
},
{
"epoch": 24.75911185588605,
"grad_norm": 4.960629463195801,
"learning_rate": 8.753665689149562e-06,
"loss": 6.1999,
"step": 59100
},
{
"epoch": 24.801005446166737,
"grad_norm": 3.191586971282959,
"learning_rate": 8.683843038681749e-06,
"loss": 6.2203,
"step": 59200
},
{
"epoch": 24.842899036447424,
"grad_norm": 3.224745512008667,
"learning_rate": 8.614020388213937e-06,
"loss": 6.212,
"step": 59300
},
{
"epoch": 24.88479262672811,
"grad_norm": 3.450741767883301,
"learning_rate": 8.544197737746124e-06,
"loss": 6.2386,
"step": 59400
},
{
"epoch": 24.926686217008797,
"grad_norm": 4.297729969024658,
"learning_rate": 8.474375087278313e-06,
"loss": 6.2088,
"step": 59500
},
{
"epoch": 24.968579807289483,
"grad_norm": 3.376110553741455,
"learning_rate": 8.404552436810503e-06,
"loss": 6.2176,
"step": 59600
},
{
"epoch": 25.010473397570173,
"grad_norm": 3.0211358070373535,
"learning_rate": 8.33472978634269e-06,
"loss": 6.1906,
"step": 59700
},
{
"epoch": 25.05236698785086,
"grad_norm": 2.8490803241729736,
"learning_rate": 8.264907135874878e-06,
"loss": 6.2,
"step": 59800
},
{
"epoch": 25.094260578131546,
"grad_norm": 3.0233705043792725,
"learning_rate": 8.195084485407065e-06,
"loss": 6.1886,
"step": 59900
},
{
"epoch": 25.136154168412233,
"grad_norm": 3.7582995891571045,
"learning_rate": 8.125261834939255e-06,
"loss": 6.2064,
"step": 60000
},
{
"epoch": 25.17804775869292,
"grad_norm": 3.128079891204834,
"learning_rate": 8.055439184471442e-06,
"loss": 6.2264,
"step": 60100
},
{
"epoch": 25.219941348973606,
"grad_norm": 3.1808972358703613,
"learning_rate": 7.985616534003632e-06,
"loss": 6.2149,
"step": 60200
},
{
"epoch": 25.261834939254292,
"grad_norm": 3.2326996326446533,
"learning_rate": 7.91579388353582e-06,
"loss": 6.2142,
"step": 60300
},
{
"epoch": 25.303728529534983,
"grad_norm": 3.267465114593506,
"learning_rate": 7.845971233068007e-06,
"loss": 6.2439,
"step": 60400
},
{
"epoch": 25.34562211981567,
"grad_norm": 3.691075563430786,
"learning_rate": 7.776148582600196e-06,
"loss": 6.2178,
"step": 60500
},
{
"epoch": 25.387515710096356,
"grad_norm": 3.290562152862549,
"learning_rate": 7.706325932132383e-06,
"loss": 6.2165,
"step": 60600
},
{
"epoch": 25.429409300377042,
"grad_norm": 4.553886413574219,
"learning_rate": 7.636503281664573e-06,
"loss": 6.2165,
"step": 60700
},
{
"epoch": 25.47130289065773,
"grad_norm": 4.013444423675537,
"learning_rate": 7.566680631196761e-06,
"loss": 6.2122,
"step": 60800
},
{
"epoch": 25.513196480938415,
"grad_norm": 4.044810771942139,
"learning_rate": 7.496857980728948e-06,
"loss": 6.2533,
"step": 60900
},
{
"epoch": 25.555090071219105,
"grad_norm": 3.788613796234131,
"learning_rate": 7.427035330261137e-06,
"loss": 6.2039,
"step": 61000
},
{
"epoch": 25.59698366149979,
"grad_norm": 3.317281484603882,
"learning_rate": 7.3572126797933255e-06,
"loss": 6.2228,
"step": 61100
},
{
"epoch": 25.638877251780478,
"grad_norm": 3.4238085746765137,
"learning_rate": 7.287390029325514e-06,
"loss": 6.1979,
"step": 61200
},
{
"epoch": 25.680770842061165,
"grad_norm": 3.1558725833892822,
"learning_rate": 7.217567378857702e-06,
"loss": 6.2044,
"step": 61300
},
{
"epoch": 25.72266443234185,
"grad_norm": 2.939328670501709,
"learning_rate": 7.147744728389889e-06,
"loss": 6.2312,
"step": 61400
},
{
"epoch": 25.764558022622538,
"grad_norm": 4.0037455558776855,
"learning_rate": 7.0779220779220775e-06,
"loss": 6.228,
"step": 61500
},
{
"epoch": 25.806451612903224,
"grad_norm": 4.4582343101501465,
"learning_rate": 7.008099427454267e-06,
"loss": 6.2132,
"step": 61600
},
{
"epoch": 25.848345203183914,
"grad_norm": 3.006201982498169,
"learning_rate": 6.938276776986455e-06,
"loss": 6.2242,
"step": 61700
},
{
"epoch": 25.8902387934646,
"grad_norm": 3.6898059844970703,
"learning_rate": 6.8684541265186436e-06,
"loss": 6.2134,
"step": 61800
},
{
"epoch": 25.932132383745287,
"grad_norm": 3.3489785194396973,
"learning_rate": 6.798631476050832e-06,
"loss": 6.2042,
"step": 61900
},
{
"epoch": 25.974025974025974,
"grad_norm": 3.2489922046661377,
"learning_rate": 6.729507052087698e-06,
"loss": 6.2212,
"step": 62000
},
{
"epoch": 26.01591956430666,
"grad_norm": 4.022356033325195,
"learning_rate": 6.659684401619885e-06,
"loss": 6.2423,
"step": 62100
},
{
"epoch": 26.057813154587347,
"grad_norm": 4.803937911987305,
"learning_rate": 6.589861751152074e-06,
"loss": 6.2319,
"step": 62200
},
{
"epoch": 26.099706744868037,
"grad_norm": 3.7283337116241455,
"learning_rate": 6.520039100684262e-06,
"loss": 6.1924,
"step": 62300
},
{
"epoch": 26.141600335148723,
"grad_norm": 3.817946672439575,
"learning_rate": 6.450216450216451e-06,
"loss": 6.2039,
"step": 62400
},
{
"epoch": 26.18349392542941,
"grad_norm": 3.4621963500976562,
"learning_rate": 6.380393799748639e-06,
"loss": 6.214,
"step": 62500
},
{
"epoch": 26.225387515710096,
"grad_norm": 4.458475112915039,
"learning_rate": 6.310571149280827e-06,
"loss": 6.2327,
"step": 62600
},
{
"epoch": 26.267281105990783,
"grad_norm": 3.1324493885040283,
"learning_rate": 6.240748498813015e-06,
"loss": 6.2518,
"step": 62700
},
{
"epoch": 26.30917469627147,
"grad_norm": 3.410626173019409,
"learning_rate": 6.1709258483452034e-06,
"loss": 6.2054,
"step": 62800
},
{
"epoch": 26.351068286552156,
"grad_norm": 3.221602201461792,
"learning_rate": 6.101103197877392e-06,
"loss": 6.2297,
"step": 62900
},
{
"epoch": 26.392961876832846,
"grad_norm": 3.1413893699645996,
"learning_rate": 6.031280547409579e-06,
"loss": 6.2134,
"step": 63000
},
{
"epoch": 26.434855467113533,
"grad_norm": 3.3834433555603027,
"learning_rate": 5.961457896941768e-06,
"loss": 6.167,
"step": 63100
},
{
"epoch": 26.47674905739422,
"grad_norm": 3.016921281814575,
"learning_rate": 5.891635246473957e-06,
"loss": 6.2181,
"step": 63200
},
{
"epoch": 26.518642647674906,
"grad_norm": 3.4190244674682617,
"learning_rate": 5.821812596006145e-06,
"loss": 6.172,
"step": 63300
},
{
"epoch": 26.560536237955592,
"grad_norm": 3.519742488861084,
"learning_rate": 5.751989945538333e-06,
"loss": 6.2144,
"step": 63400
},
{
"epoch": 26.60242982823628,
"grad_norm": 3.083923101425171,
"learning_rate": 5.682167295070521e-06,
"loss": 6.204,
"step": 63500
},
{
"epoch": 26.64432341851697,
"grad_norm": 3.8977878093719482,
"learning_rate": 5.612344644602709e-06,
"loss": 6.1759,
"step": 63600
},
{
"epoch": 26.686217008797655,
"grad_norm": 3.5598249435424805,
"learning_rate": 5.5425219941348974e-06,
"loss": 6.2233,
"step": 63700
},
{
"epoch": 26.72811059907834,
"grad_norm": 3.6333513259887695,
"learning_rate": 5.472699343667086e-06,
"loss": 6.2133,
"step": 63800
},
{
"epoch": 26.770004189359028,
"grad_norm": 3.2468085289001465,
"learning_rate": 5.402876693199274e-06,
"loss": 6.2081,
"step": 63900
},
{
"epoch": 26.811897779639715,
"grad_norm": 3.6896772384643555,
"learning_rate": 5.333054042731463e-06,
"loss": 6.1935,
"step": 64000
},
{
"epoch": 26.8537913699204,
"grad_norm": 3.263144016265869,
"learning_rate": 5.26323139226365e-06,
"loss": 6.2127,
"step": 64100
},
{
"epoch": 26.895684960201088,
"grad_norm": 3.2848362922668457,
"learning_rate": 5.193408741795839e-06,
"loss": 6.2074,
"step": 64200
},
{
"epoch": 26.937578550481778,
"grad_norm": 3.675541639328003,
"learning_rate": 5.123586091328027e-06,
"loss": 6.2015,
"step": 64300
},
{
"epoch": 26.979472140762464,
"grad_norm": 3.413780689239502,
"learning_rate": 5.0537634408602155e-06,
"loss": 6.2218,
"step": 64400
},
{
"epoch": 27.02136573104315,
"grad_norm": 4.108157634735107,
"learning_rate": 4.983940790392404e-06,
"loss": 6.212,
"step": 64500
},
{
"epoch": 27.063259321323837,
"grad_norm": 3.7690155506134033,
"learning_rate": 4.9141181399245915e-06,
"loss": 6.22,
"step": 64600
},
{
"epoch": 27.105152911604524,
"grad_norm": 3.379786491394043,
"learning_rate": 4.84429548945678e-06,
"loss": 6.2334,
"step": 64700
},
{
"epoch": 27.14704650188521,
"grad_norm": 3.5175390243530273,
"learning_rate": 4.774472838988968e-06,
"loss": 6.1932,
"step": 64800
},
{
"epoch": 27.1889400921659,
"grad_norm": 2.8454129695892334,
"learning_rate": 4.704650188521157e-06,
"loss": 6.2087,
"step": 64900
},
{
"epoch": 27.230833682446587,
"grad_norm": 3.4630961418151855,
"learning_rate": 4.634827538053345e-06,
"loss": 6.2142,
"step": 65000
},
{
"epoch": 27.272727272727273,
"grad_norm": 3.339860677719116,
"learning_rate": 4.565004887585533e-06,
"loss": 6.1772,
"step": 65100
},
{
"epoch": 27.31462086300796,
"grad_norm": 3.0743260383605957,
"learning_rate": 4.495182237117721e-06,
"loss": 6.2044,
"step": 65200
},
{
"epoch": 27.356514453288646,
"grad_norm": 3.2576496601104736,
"learning_rate": 4.4253595866499095e-06,
"loss": 6.2191,
"step": 65300
},
{
"epoch": 27.398408043569333,
"grad_norm": 3.326819896697998,
"learning_rate": 4.355536936182097e-06,
"loss": 6.1762,
"step": 65400
},
{
"epoch": 27.44030163385002,
"grad_norm": 3.4447667598724365,
"learning_rate": 4.285714285714286e-06,
"loss": 6.1823,
"step": 65500
},
{
"epoch": 27.48219522413071,
"grad_norm": 3.4771687984466553,
"learning_rate": 4.215891635246475e-06,
"loss": 6.228,
"step": 65600
},
{
"epoch": 27.524088814411396,
"grad_norm": 3.3457424640655518,
"learning_rate": 4.146068984778662e-06,
"loss": 6.1651,
"step": 65700
},
{
"epoch": 27.565982404692082,
"grad_norm": 3.006155490875244,
"learning_rate": 4.076246334310851e-06,
"loss": 6.2026,
"step": 65800
},
{
"epoch": 27.60787599497277,
"grad_norm": 4.228708744049072,
"learning_rate": 4.006423683843038e-06,
"loss": 6.1923,
"step": 65900
},
{
"epoch": 27.649769585253456,
"grad_norm": 3.4744226932525635,
"learning_rate": 3.937299259879905e-06,
"loss": 6.1891,
"step": 66000
},
{
"epoch": 27.691663175534142,
"grad_norm": 3.8300633430480957,
"learning_rate": 3.867476609412093e-06,
"loss": 6.2237,
"step": 66100
},
{
"epoch": 27.733556765814832,
"grad_norm": 2.9689528942108154,
"learning_rate": 3.7976539589442818e-06,
"loss": 6.217,
"step": 66200
},
{
"epoch": 27.77545035609552,
"grad_norm": 3.1309947967529297,
"learning_rate": 3.7278313084764698e-06,
"loss": 6.2061,
"step": 66300
},
{
"epoch": 27.817343946376205,
"grad_norm": 3.4571166038513184,
"learning_rate": 3.658008658008658e-06,
"loss": 6.1863,
"step": 66400
},
{
"epoch": 27.85923753665689,
"grad_norm": 3.354229211807251,
"learning_rate": 3.5881860075408466e-06,
"loss": 6.1996,
"step": 66500
},
{
"epoch": 27.901131126937578,
"grad_norm": 3.745568037033081,
"learning_rate": 3.5183633570730346e-06,
"loss": 6.1839,
"step": 66600
},
{
"epoch": 27.943024717218265,
"grad_norm": 3.356715440750122,
"learning_rate": 3.448540706605223e-06,
"loss": 6.2048,
"step": 66700
},
{
"epoch": 27.98491830749895,
"grad_norm": 2.964492082595825,
"learning_rate": 3.378718056137411e-06,
"loss": 6.228,
"step": 66800
},
{
"epoch": 28.02681189777964,
"grad_norm": 3.336606502532959,
"learning_rate": 3.3088954056695994e-06,
"loss": 6.1953,
"step": 66900
},
{
"epoch": 28.068705488060328,
"grad_norm": 3.264971971511841,
"learning_rate": 3.239072755201788e-06,
"loss": 6.1783,
"step": 67000
},
{
"epoch": 28.110599078341014,
"grad_norm": 3.4968082904815674,
"learning_rate": 3.169250104733976e-06,
"loss": 6.2117,
"step": 67100
},
{
"epoch": 28.1524926686217,
"grad_norm": 3.4082252979278564,
"learning_rate": 3.099427454266164e-06,
"loss": 6.2278,
"step": 67200
},
{
"epoch": 28.194386258902387,
"grad_norm": 3.52056884765625,
"learning_rate": 3.029604803798352e-06,
"loss": 6.2037,
"step": 67300
},
{
"epoch": 28.236279849183074,
"grad_norm": 3.6062779426574707,
"learning_rate": 2.9597821533305406e-06,
"loss": 6.1952,
"step": 67400
},
{
"epoch": 28.278173439463764,
"grad_norm": 3.158705472946167,
"learning_rate": 2.889959502862729e-06,
"loss": 6.2067,
"step": 67500
},
{
"epoch": 28.32006702974445,
"grad_norm": 3.6732075214385986,
"learning_rate": 2.820136852394917e-06,
"loss": 6.1752,
"step": 67600
},
{
"epoch": 28.361960620025137,
"grad_norm": 2.842560291290283,
"learning_rate": 2.7503142019271054e-06,
"loss": 6.1823,
"step": 67700
},
{
"epoch": 28.403854210305823,
"grad_norm": 3.412233591079712,
"learning_rate": 2.6804915514592934e-06,
"loss": 6.1997,
"step": 67800
},
{
"epoch": 28.44574780058651,
"grad_norm": 2.8313143253326416,
"learning_rate": 2.610668900991482e-06,
"loss": 6.187,
"step": 67900
},
{
"epoch": 28.487641390867196,
"grad_norm": 3.122307300567627,
"learning_rate": 2.541544477028348e-06,
"loss": 6.1977,
"step": 68000
},
{
"epoch": 28.529534981147883,
"grad_norm": 3.4732697010040283,
"learning_rate": 2.4717218265605365e-06,
"loss": 6.1913,
"step": 68100
},
{
"epoch": 28.571428571428573,
"grad_norm": 3.3936917781829834,
"learning_rate": 2.4018991760927245e-06,
"loss": 6.2216,
"step": 68200
},
{
"epoch": 28.61332216170926,
"grad_norm": 3.2980170249938965,
"learning_rate": 2.332076525624913e-06,
"loss": 6.1989,
"step": 68300
},
{
"epoch": 28.655215751989946,
"grad_norm": 4.099823951721191,
"learning_rate": 2.2622538751571013e-06,
"loss": 6.1755,
"step": 68400
},
{
"epoch": 28.697109342270632,
"grad_norm": 3.7930960655212402,
"learning_rate": 2.1924312246892893e-06,
"loss": 6.2155,
"step": 68500
},
{
"epoch": 28.73900293255132,
"grad_norm": 3.620065212249756,
"learning_rate": 2.1226085742214777e-06,
"loss": 6.2041,
"step": 68600
},
{
"epoch": 28.780896522832006,
"grad_norm": 3.3451735973358154,
"learning_rate": 2.0527859237536657e-06,
"loss": 6.2095,
"step": 68700
},
{
"epoch": 28.822790113112696,
"grad_norm": 4.007857799530029,
"learning_rate": 1.982963273285854e-06,
"loss": 6.2283,
"step": 68800
},
{
"epoch": 28.864683703393382,
"grad_norm": 4.236888885498047,
"learning_rate": 1.9131406228180425e-06,
"loss": 6.1799,
"step": 68900
},
{
"epoch": 28.90657729367407,
"grad_norm": 3.222273111343384,
"learning_rate": 1.8433179723502305e-06,
"loss": 6.2008,
"step": 69000
},
{
"epoch": 28.948470883954755,
"grad_norm": 3.8649580478668213,
"learning_rate": 1.7734953218824187e-06,
"loss": 6.194,
"step": 69100
},
{
"epoch": 28.99036447423544,
"grad_norm": 3.9630191326141357,
"learning_rate": 1.7036726714146071e-06,
"loss": 6.2083,
"step": 69200
},
{
"epoch": 29.032258064516128,
"grad_norm": 3.9617035388946533,
"learning_rate": 1.6338500209467953e-06,
"loss": 6.2197,
"step": 69300
},
{
"epoch": 29.074151654796815,
"grad_norm": 3.4647514820098877,
"learning_rate": 1.5640273704789835e-06,
"loss": 6.1919,
"step": 69400
},
{
"epoch": 29.116045245077505,
"grad_norm": 3.7548468112945557,
"learning_rate": 1.4942047200111717e-06,
"loss": 6.171,
"step": 69500
},
{
"epoch": 29.15793883535819,
"grad_norm": 3.4267735481262207,
"learning_rate": 1.42438206954336e-06,
"loss": 6.2031,
"step": 69600
},
{
"epoch": 29.199832425638878,
"grad_norm": 3.166888952255249,
"learning_rate": 1.3545594190755483e-06,
"loss": 6.1937,
"step": 69700
},
{
"epoch": 29.241726015919564,
"grad_norm": 2.9794344902038574,
"learning_rate": 1.2847367686077363e-06,
"loss": 6.2068,
"step": 69800
},
{
"epoch": 29.28361960620025,
"grad_norm": 3.056293249130249,
"learning_rate": 1.2149141181399247e-06,
"loss": 6.1652,
"step": 69900
},
{
"epoch": 29.325513196480937,
"grad_norm": 3.851149320602417,
"learning_rate": 1.145789694176791e-06,
"loss": 6.2129,
"step": 70000
},
{
"epoch": 29.367406786761624,
"grad_norm": 3.670929193496704,
"learning_rate": 1.0759670437089792e-06,
"loss": 6.2068,
"step": 70100
},
{
"epoch": 29.409300377042314,
"grad_norm": 3.3581252098083496,
"learning_rate": 1.0061443932411674e-06,
"loss": 6.2208,
"step": 70200
},
{
"epoch": 29.451193967323,
"grad_norm": 3.7551257610321045,
"learning_rate": 9.363217427733557e-07,
"loss": 6.179,
"step": 70300
},
{
"epoch": 29.493087557603687,
"grad_norm": 2.9767682552337646,
"learning_rate": 8.664990923055439e-07,
"loss": 6.2157,
"step": 70400
},
{
"epoch": 29.534981147884373,
"grad_norm": 3.3218774795532227,
"learning_rate": 7.966764418377322e-07,
"loss": 6.1773,
"step": 70500
},
{
"epoch": 29.57687473816506,
"grad_norm": 4.360437870025635,
"learning_rate": 7.268537913699204e-07,
"loss": 6.1985,
"step": 70600
},
{
"epoch": 29.618768328445746,
"grad_norm": 3.544264078140259,
"learning_rate": 6.570311409021087e-07,
"loss": 6.1973,
"step": 70700
},
{
"epoch": 29.660661918726436,
"grad_norm": 3.7416069507598877,
"learning_rate": 5.872084904342969e-07,
"loss": 6.2031,
"step": 70800
},
{
"epoch": 29.702555509007123,
"grad_norm": 3.0346035957336426,
"learning_rate": 5.173858399664851e-07,
"loss": 6.2123,
"step": 70900
},
{
"epoch": 29.74444909928781,
"grad_norm": 3.2308425903320312,
"learning_rate": 4.4756318949867344e-07,
"loss": 6.2106,
"step": 71000
},
{
"epoch": 29.786342689568496,
"grad_norm": 3.0109570026397705,
"learning_rate": 3.7774053903086163e-07,
"loss": 6.213,
"step": 71100
},
{
"epoch": 29.828236279849182,
"grad_norm": 3.733609199523926,
"learning_rate": 3.079178885630499e-07,
"loss": 6.1984,
"step": 71200
},
{
"epoch": 29.87012987012987,
"grad_norm": 3.5430541038513184,
"learning_rate": 2.3809523809523814e-07,
"loss": 6.183,
"step": 71300
},
{
"epoch": 29.912023460410555,
"grad_norm": 3.1964950561523438,
"learning_rate": 1.6827258762742634e-07,
"loss": 6.1817,
"step": 71400
},
{
"epoch": 29.953917050691246,
"grad_norm": 3.6197755336761475,
"learning_rate": 9.844993715961458e-08,
"loss": 6.1907,
"step": 71500
},
{
"epoch": 29.995810640971932,
"grad_norm": 3.035473346710205,
"learning_rate": 2.862728669180282e-08,
"loss": 6.1677,
"step": 71600
},
{
"epoch": 30.0,
"step": 71610,
"total_flos": 781486986700800.0,
"train_loss": 6.528281962779993,
"train_runtime": 3948.1737,
"train_samples_per_second": 580.354,
"train_steps_per_second": 18.137
}
],
"logging_steps": 100,
"max_steps": 71610,
"num_input_tokens_seen": 0,
"num_train_epochs": 30,
"save_steps": 5000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 781486986700800.0,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}