llava-llama3.1-8b / trainer_state.json
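A minimal sketch, not part of the original checkpoint file: assuming the JSON below is saved locally as trainer_state.json (a hypothetical path), the following Python reads it with the standard json module and summarizes the run using only fields that appear in the log (best_metric, best_model_checkpoint, log_history, loss, eval_loss, step).

import json

# Load the trainer state dumped below (hypothetical local path).
with open("trainer_state.json") as f:
    state = json.load(f)

print("best_metric:", state["best_metric"])
print("best checkpoint:", state["best_model_checkpoint"])

# Split the log into training entries (have "loss") and eval entries (have "eval_loss").
train_logs = [e for e in state["log_history"] if "loss" in e]
eval_logs = [e for e in state["log_history"] if "eval_loss" in e]

print("last logged train loss:", train_logs[-1]["loss"], "at step", train_logs[-1]["step"])
for e in eval_logs:
    print(f"step {e['step']:>5}: eval_loss={e['eval_loss']:.4f}")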
{
"best_metric": 0.79888368,
"best_model_checkpoint": "/mnt/nas1/daoze/code/swift/output/llava1_6-llama3_1-8b-instruct-my/v33-20240901-191352/checkpoint-4200",
"epoch": 1.9995860070378804,
"eval_steps": 300,
"global_step": 4830,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00041399296211964395,
"grad_norm": 68.31517175098004,
"learning_rate": 0.0,
"loss": 7.09566689,
"memory(GiB)": 42.23,
"step": 1,
"train_speed(iter/s)": 0.013537
},
{
"epoch": 0.0020699648105982197,
"grad_norm": 75.27486731705963,
"learning_rate": 3.2339240870284233e-06,
"loss": 4.4965229,
"memory(GiB)": 48.78,
"step": 5,
"train_speed(iter/s)": 0.018721
},
{
"epoch": 0.004139929621196439,
"grad_norm": 7.7589717582741615,
"learning_rate": 4.626699381900465e-06,
"loss": 1.79170246,
"memory(GiB)": 52.1,
"step": 10,
"train_speed(iter/s)": 0.01954
},
{
"epoch": 0.00620989443179466,
"grad_norm": 4.8519342782653485,
"learning_rate": 5.44142070133146e-06,
"loss": 1.36023655,
"memory(GiB)": 52.1,
"step": 15,
"train_speed(iter/s)": 0.019784
},
{
"epoch": 0.008279859242392879,
"grad_norm": 3.375818313593022,
"learning_rate": 6.0194746767725065e-06,
"loss": 1.18811779,
"memory(GiB)": 52.1,
"step": 20,
"train_speed(iter/s)": 0.020008
},
{
"epoch": 0.0103498240529911,
"grad_norm": 3.5284378077706813,
"learning_rate": 6.467848174056847e-06,
"loss": 1.1600296,
"memory(GiB)": 52.1,
"step": 25,
"train_speed(iter/s)": 0.020192
},
{
"epoch": 0.01241978886358932,
"grad_norm": 3.163950589670924,
"learning_rate": 6.834195996203502e-06,
"loss": 1.11185207,
"memory(GiB)": 52.1,
"step": 30,
"train_speed(iter/s)": 0.020268
},
{
"epoch": 0.014489753674187538,
"grad_norm": 2.212271768126557,
"learning_rate": 7.143938666407679e-06,
"loss": 1.0538393,
"memory(GiB)": 52.1,
"step": 35,
"train_speed(iter/s)": 0.020328
},
{
"epoch": 0.016559718484785758,
"grad_norm": 3.493503288451801,
"learning_rate": 7.412249971644547e-06,
"loss": 1.04750004,
"memory(GiB)": 52.1,
"step": 40,
"train_speed(iter/s)": 0.020393
},
{
"epoch": 0.01862968329538398,
"grad_norm": 2.6291679253427644,
"learning_rate": 7.648917315634497e-06,
"loss": 1.04756851,
"memory(GiB)": 52.1,
"step": 45,
"train_speed(iter/s)": 0.020444
},
{
"epoch": 0.0206996481059822,
"grad_norm": 3.1377476353112312,
"learning_rate": 7.860623468928888e-06,
"loss": 1.00407467,
"memory(GiB)": 52.1,
"step": 50,
"train_speed(iter/s)": 0.02043
},
{
"epoch": 0.022769612916580417,
"grad_norm": 2.6547100112946733,
"learning_rate": 8.05213497976565e-06,
"loss": 1.01097145,
"memory(GiB)": 52.1,
"step": 55,
"train_speed(iter/s)": 0.020407
},
{
"epoch": 0.02483957772717864,
"grad_norm": 2.6383576740248555,
"learning_rate": 8.226971291075542e-06,
"loss": 0.98796005,
"memory(GiB)": 52.1,
"step": 60,
"train_speed(iter/s)": 0.020419
},
{
"epoch": 0.02690954253777686,
"grad_norm": 2.542613928151568,
"learning_rate": 8.387805106618597e-06,
"loss": 0.98850031,
"memory(GiB)": 52.1,
"step": 65,
"train_speed(iter/s)": 0.020424
},
{
"epoch": 0.028979507348375077,
"grad_norm": 2.877488922903144,
"learning_rate": 8.536713961279723e-06,
"loss": 0.98919926,
"memory(GiB)": 52.1,
"step": 70,
"train_speed(iter/s)": 0.020439
},
{
"epoch": 0.031049472158973298,
"grad_norm": 1.8237010588157114,
"learning_rate": 8.675344788359883e-06,
"loss": 1.012813,
"memory(GiB)": 52.1,
"step": 75,
"train_speed(iter/s)": 0.020466
},
{
"epoch": 0.033119436969571515,
"grad_norm": 2.5261639534973788,
"learning_rate": 8.805025266516589e-06,
"loss": 0.98800411,
"memory(GiB)": 52.1,
"step": 80,
"train_speed(iter/s)": 0.02047
},
{
"epoch": 0.035189401780169736,
"grad_norm": 2.2711243603226032,
"learning_rate": 8.926841351029377e-06,
"loss": 1.00139637,
"memory(GiB)": 55.52,
"step": 85,
"train_speed(iter/s)": 0.020479
},
{
"epoch": 0.03725936659076796,
"grad_norm": 2.145948654847171,
"learning_rate": 9.041692610506539e-06,
"loss": 0.99127426,
"memory(GiB)": 55.52,
"step": 90,
"train_speed(iter/s)": 0.020491
},
{
"epoch": 0.03932933140136618,
"grad_norm": 2.836740011107602,
"learning_rate": 9.150332582159872e-06,
"loss": 0.96552677,
"memory(GiB)": 55.52,
"step": 95,
"train_speed(iter/s)": 0.020506
},
{
"epoch": 0.0413992962119644,
"grad_norm": 2.2398611362766006,
"learning_rate": 9.25339876380093e-06,
"loss": 0.98038893,
"memory(GiB)": 55.52,
"step": 100,
"train_speed(iter/s)": 0.020501
},
{
"epoch": 0.043469261022562614,
"grad_norm": 2.4986121497854192,
"learning_rate": 9.351435280710716e-06,
"loss": 0.9604641,
"memory(GiB)": 63.35,
"step": 105,
"train_speed(iter/s)": 0.020504
},
{
"epoch": 0.045539225833160835,
"grad_norm": 2.1909523678523755,
"learning_rate": 9.444910274637691e-06,
"loss": 0.93479166,
"memory(GiB)": 63.35,
"step": 110,
"train_speed(iter/s)": 0.020519
},
{
"epoch": 0.047609190643759056,
"grad_norm": 2.5301628247983663,
"learning_rate": 9.534229424247679e-06,
"loss": 0.97118149,
"memory(GiB)": 63.35,
"step": 115,
"train_speed(iter/s)": 0.020522
},
{
"epoch": 0.04967915545435728,
"grad_norm": 1.92102667189958,
"learning_rate": 9.619746585947584e-06,
"loss": 0.97038708,
"memory(GiB)": 63.35,
"step": 120,
"train_speed(iter/s)": 0.020545
},
{
"epoch": 0.0517491202649555,
"grad_norm": 2.2968578337197036,
"learning_rate": 9.701772261085271e-06,
"loss": 0.96241703,
"memory(GiB)": 63.35,
"step": 125,
"train_speed(iter/s)": 0.020544
},
{
"epoch": 0.05381908507555372,
"grad_norm": 2.110866882726078,
"learning_rate": 9.780580401490638e-06,
"loss": 0.95122089,
"memory(GiB)": 63.35,
"step": 130,
"train_speed(iter/s)": 0.020551
},
{
"epoch": 0.05588904988615193,
"grad_norm": 2.082289679777962,
"learning_rate": 9.856413929937534e-06,
"loss": 0.95173302,
"memory(GiB)": 63.35,
"step": 135,
"train_speed(iter/s)": 0.020557
},
{
"epoch": 0.057959014696750154,
"grad_norm": 2.6318522567639784,
"learning_rate": 9.929489256151762e-06,
"loss": 0.94850836,
"memory(GiB)": 63.35,
"step": 140,
"train_speed(iter/s)": 0.020559
},
{
"epoch": 0.060028979507348375,
"grad_norm": 2.091985474615185,
"learning_rate": 1e-05,
"loss": 0.91576376,
"memory(GiB)": 63.35,
"step": 145,
"train_speed(iter/s)": 0.020557
},
{
"epoch": 0.062098944317946596,
"grad_norm": 2.3841665763842705,
"learning_rate": 9.991462113127002e-06,
"loss": 0.93616524,
"memory(GiB)": 63.35,
"step": 150,
"train_speed(iter/s)": 0.020563
},
{
"epoch": 0.06416890912854481,
"grad_norm": 2.1042524174231505,
"learning_rate": 9.980789754535753e-06,
"loss": 0.9277298,
"memory(GiB)": 63.35,
"step": 155,
"train_speed(iter/s)": 0.020558
},
{
"epoch": 0.06623887393914303,
"grad_norm": 2.276077986025351,
"learning_rate": 9.970117395944504e-06,
"loss": 0.95900288,
"memory(GiB)": 63.35,
"step": 160,
"train_speed(iter/s)": 0.020556
},
{
"epoch": 0.06830883874974125,
"grad_norm": 2.2459536201915387,
"learning_rate": 9.959445037353256e-06,
"loss": 0.94394236,
"memory(GiB)": 63.35,
"step": 165,
"train_speed(iter/s)": 0.020552
},
{
"epoch": 0.07037880356033947,
"grad_norm": 2.517051668755459,
"learning_rate": 9.948772678762007e-06,
"loss": 0.94310379,
"memory(GiB)": 63.35,
"step": 170,
"train_speed(iter/s)": 0.020552
},
{
"epoch": 0.0724487683709377,
"grad_norm": 2.2026553318124313,
"learning_rate": 9.938100320170759e-06,
"loss": 0.95436573,
"memory(GiB)": 63.35,
"step": 175,
"train_speed(iter/s)": 0.020554
},
{
"epoch": 0.07451873318153591,
"grad_norm": 2.2476130486911465,
"learning_rate": 9.92742796157951e-06,
"loss": 0.93582458,
"memory(GiB)": 63.35,
"step": 180,
"train_speed(iter/s)": 0.020553
},
{
"epoch": 0.07658869799213414,
"grad_norm": 2.388512026493847,
"learning_rate": 9.916755602988262e-06,
"loss": 0.91813745,
"memory(GiB)": 63.35,
"step": 185,
"train_speed(iter/s)": 0.020558
},
{
"epoch": 0.07865866280273236,
"grad_norm": 2.3220003726970204,
"learning_rate": 9.906083244397012e-06,
"loss": 0.92115765,
"memory(GiB)": 63.35,
"step": 190,
"train_speed(iter/s)": 0.020561
},
{
"epoch": 0.08072862761333058,
"grad_norm": 2.3768930570990805,
"learning_rate": 9.895410885805764e-06,
"loss": 0.91676846,
"memory(GiB)": 63.35,
"step": 195,
"train_speed(iter/s)": 0.020562
},
{
"epoch": 0.0827985924239288,
"grad_norm": 4.2054479128966165,
"learning_rate": 9.884738527214515e-06,
"loss": 0.91852398,
"memory(GiB)": 63.35,
"step": 200,
"train_speed(iter/s)": 0.020566
},
{
"epoch": 0.084868557234527,
"grad_norm": 2.14133807915419,
"learning_rate": 9.874066168623266e-06,
"loss": 0.91718044,
"memory(GiB)": 63.35,
"step": 205,
"train_speed(iter/s)": 0.020563
},
{
"epoch": 0.08693852204512523,
"grad_norm": 1.9390816379786193,
"learning_rate": 9.863393810032017e-06,
"loss": 0.92340775,
"memory(GiB)": 63.35,
"step": 210,
"train_speed(iter/s)": 0.020559
},
{
"epoch": 0.08900848685572345,
"grad_norm": 2.1261095243289208,
"learning_rate": 9.852721451440769e-06,
"loss": 0.90159931,
"memory(GiB)": 63.35,
"step": 215,
"train_speed(iter/s)": 0.020561
},
{
"epoch": 0.09107845166632167,
"grad_norm": 2.9284829291227243,
"learning_rate": 9.842049092849521e-06,
"loss": 0.91123104,
"memory(GiB)": 63.35,
"step": 220,
"train_speed(iter/s)": 0.020557
},
{
"epoch": 0.09314841647691989,
"grad_norm": 2.7049739018013583,
"learning_rate": 9.831376734258272e-06,
"loss": 0.941084,
"memory(GiB)": 63.35,
"step": 225,
"train_speed(iter/s)": 0.02056
},
{
"epoch": 0.09521838128751811,
"grad_norm": 2.0467249315549845,
"learning_rate": 9.820704375667023e-06,
"loss": 0.90574436,
"memory(GiB)": 63.35,
"step": 230,
"train_speed(iter/s)": 0.020561
},
{
"epoch": 0.09728834609811633,
"grad_norm": 2.1061950654127006,
"learning_rate": 9.810032017075774e-06,
"loss": 0.92933855,
"memory(GiB)": 63.35,
"step": 235,
"train_speed(iter/s)": 0.020568
},
{
"epoch": 0.09935831090871455,
"grad_norm": 2.390745868374031,
"learning_rate": 9.799359658484527e-06,
"loss": 0.9383173,
"memory(GiB)": 63.35,
"step": 240,
"train_speed(iter/s)": 0.02057
},
{
"epoch": 0.10142827571931277,
"grad_norm": 1.9948725536279355,
"learning_rate": 9.788687299893276e-06,
"loss": 0.85975437,
"memory(GiB)": 63.35,
"step": 245,
"train_speed(iter/s)": 0.020577
},
{
"epoch": 0.103498240529911,
"grad_norm": 2.29936461247775,
"learning_rate": 9.77801494130203e-06,
"loss": 0.90683613,
"memory(GiB)": 63.47,
"step": 250,
"train_speed(iter/s)": 0.020575
},
{
"epoch": 0.10556820534050922,
"grad_norm": 1.8961456174283475,
"learning_rate": 9.76734258271078e-06,
"loss": 0.88947477,
"memory(GiB)": 63.47,
"step": 255,
"train_speed(iter/s)": 0.020574
},
{
"epoch": 0.10763817015110744,
"grad_norm": 1.8558977060217532,
"learning_rate": 9.756670224119531e-06,
"loss": 0.90292645,
"memory(GiB)": 63.47,
"step": 260,
"train_speed(iter/s)": 0.020581
},
{
"epoch": 0.10970813496170564,
"grad_norm": 2.1017797656853725,
"learning_rate": 9.745997865528282e-06,
"loss": 0.92776756,
"memory(GiB)": 63.47,
"step": 265,
"train_speed(iter/s)": 0.020579
},
{
"epoch": 0.11177809977230387,
"grad_norm": 1.9051226867861688,
"learning_rate": 9.735325506937033e-06,
"loss": 0.90565796,
"memory(GiB)": 63.59,
"step": 270,
"train_speed(iter/s)": 0.020578
},
{
"epoch": 0.11384806458290209,
"grad_norm": 1.932217719000402,
"learning_rate": 9.724653148345784e-06,
"loss": 0.90403481,
"memory(GiB)": 63.59,
"step": 275,
"train_speed(iter/s)": 0.020575
},
{
"epoch": 0.11591802939350031,
"grad_norm": 3.033913485092789,
"learning_rate": 9.713980789754537e-06,
"loss": 0.86916351,
"memory(GiB)": 63.59,
"step": 280,
"train_speed(iter/s)": 0.020575
},
{
"epoch": 0.11798799420409853,
"grad_norm": 1.9434774416915237,
"learning_rate": 9.703308431163288e-06,
"loss": 0.87491503,
"memory(GiB)": 63.59,
"step": 285,
"train_speed(iter/s)": 0.020569
},
{
"epoch": 0.12005795901469675,
"grad_norm": 2.051766135268311,
"learning_rate": 9.69263607257204e-06,
"loss": 0.89817352,
"memory(GiB)": 63.59,
"step": 290,
"train_speed(iter/s)": 0.020572
},
{
"epoch": 0.12212792382529497,
"grad_norm": 2.2615047465793796,
"learning_rate": 9.68196371398079e-06,
"loss": 0.89681797,
"memory(GiB)": 63.59,
"step": 295,
"train_speed(iter/s)": 0.020574
},
{
"epoch": 0.12419788863589319,
"grad_norm": 2.3348971307519637,
"learning_rate": 9.671291355389541e-06,
"loss": 0.91658554,
"memory(GiB)": 63.59,
"step": 300,
"train_speed(iter/s)": 0.020576
},
{
"epoch": 0.12419788863589319,
"eval_loss": 0.9112715721130371,
"eval_runtime": 338.0523,
"eval_samples_per_second": 18.476,
"eval_steps_per_second": 1.157,
"step": 300
},
{
"epoch": 0.1262678534464914,
"grad_norm": 2.0234892490546614,
"learning_rate": 9.660618996798294e-06,
"loss": 0.89021435,
"memory(GiB)": 63.59,
"step": 305,
"train_speed(iter/s)": 0.02005
},
{
"epoch": 0.12833781825708962,
"grad_norm": 2.3363054707830195,
"learning_rate": 9.649946638207045e-06,
"loss": 0.87254162,
"memory(GiB)": 63.59,
"step": 310,
"train_speed(iter/s)": 0.02006
},
{
"epoch": 0.13040778306768785,
"grad_norm": 1.820741232007416,
"learning_rate": 9.639274279615796e-06,
"loss": 0.90060663,
"memory(GiB)": 63.59,
"step": 315,
"train_speed(iter/s)": 0.020066
},
{
"epoch": 0.13247774787828606,
"grad_norm": 1.9084755504752218,
"learning_rate": 9.628601921024547e-06,
"loss": 0.8869771,
"memory(GiB)": 63.59,
"step": 320,
"train_speed(iter/s)": 0.020075
},
{
"epoch": 0.1345477126888843,
"grad_norm": 1.8856515103808584,
"learning_rate": 9.617929562433298e-06,
"loss": 0.87808056,
"memory(GiB)": 63.59,
"step": 325,
"train_speed(iter/s)": 0.020084
},
{
"epoch": 0.1366176774994825,
"grad_norm": 1.9384672045466198,
"learning_rate": 9.60725720384205e-06,
"loss": 0.88911896,
"memory(GiB)": 63.59,
"step": 330,
"train_speed(iter/s)": 0.020095
},
{
"epoch": 0.13868764231008074,
"grad_norm": 1.9948177737503383,
"learning_rate": 9.596584845250802e-06,
"loss": 0.8958828,
"memory(GiB)": 63.59,
"step": 335,
"train_speed(iter/s)": 0.020101
},
{
"epoch": 0.14075760712067895,
"grad_norm": 2.2317608749634874,
"learning_rate": 9.585912486659551e-06,
"loss": 0.89190845,
"memory(GiB)": 63.59,
"step": 340,
"train_speed(iter/s)": 0.020112
},
{
"epoch": 0.14282757193127718,
"grad_norm": 2.1298996429991908,
"learning_rate": 9.575240128068304e-06,
"loss": 0.87826462,
"memory(GiB)": 63.59,
"step": 345,
"train_speed(iter/s)": 0.02012
},
{
"epoch": 0.1448975367418754,
"grad_norm": 1.9367356654913552,
"learning_rate": 9.564567769477055e-06,
"loss": 0.88935146,
"memory(GiB)": 63.59,
"step": 350,
"train_speed(iter/s)": 0.020128
},
{
"epoch": 0.1469675015524736,
"grad_norm": 2.965286627679984,
"learning_rate": 9.553895410885806e-06,
"loss": 0.91815538,
"memory(GiB)": 63.59,
"step": 355,
"train_speed(iter/s)": 0.020135
},
{
"epoch": 0.14903746636307183,
"grad_norm": 2.322705848413725,
"learning_rate": 9.543223052294557e-06,
"loss": 0.89534588,
"memory(GiB)": 63.59,
"step": 360,
"train_speed(iter/s)": 0.020143
},
{
"epoch": 0.15110743117367004,
"grad_norm": 1.9467115924709606,
"learning_rate": 9.53255069370331e-06,
"loss": 0.87547607,
"memory(GiB)": 63.59,
"step": 365,
"train_speed(iter/s)": 0.020146
},
{
"epoch": 0.15317739598426827,
"grad_norm": 2.056244423727218,
"learning_rate": 9.521878335112061e-06,
"loss": 0.87522736,
"memory(GiB)": 63.59,
"step": 370,
"train_speed(iter/s)": 0.02015
},
{
"epoch": 0.15524736079486648,
"grad_norm": 1.9570323722204157,
"learning_rate": 9.511205976520812e-06,
"loss": 0.88565502,
"memory(GiB)": 63.59,
"step": 375,
"train_speed(iter/s)": 0.020154
},
{
"epoch": 0.15731732560546471,
"grad_norm": 1.9075809920338722,
"learning_rate": 9.500533617929563e-06,
"loss": 0.87662697,
"memory(GiB)": 63.59,
"step": 380,
"train_speed(iter/s)": 0.020159
},
{
"epoch": 0.15938729041606292,
"grad_norm": 2.3674969791857983,
"learning_rate": 9.489861259338314e-06,
"loss": 0.88602619,
"memory(GiB)": 63.59,
"step": 385,
"train_speed(iter/s)": 0.020163
},
{
"epoch": 0.16145725522666116,
"grad_norm": 2.157304860709474,
"learning_rate": 9.479188900747067e-06,
"loss": 0.86379642,
"memory(GiB)": 63.59,
"step": 390,
"train_speed(iter/s)": 0.02017
},
{
"epoch": 0.16352722003725936,
"grad_norm": 1.8795995948139297,
"learning_rate": 9.468516542155816e-06,
"loss": 0.88137684,
"memory(GiB)": 63.59,
"step": 395,
"train_speed(iter/s)": 0.020176
},
{
"epoch": 0.1655971848478576,
"grad_norm": 1.8996836205094734,
"learning_rate": 9.457844183564569e-06,
"loss": 0.86949444,
"memory(GiB)": 63.59,
"step": 400,
"train_speed(iter/s)": 0.020182
},
{
"epoch": 0.1676671496584558,
"grad_norm": 2.0977604679128854,
"learning_rate": 9.44717182497332e-06,
"loss": 0.85557442,
"memory(GiB)": 63.59,
"step": 405,
"train_speed(iter/s)": 0.020184
},
{
"epoch": 0.169737114469054,
"grad_norm": 2.2253818762342155,
"learning_rate": 9.436499466382071e-06,
"loss": 0.85497751,
"memory(GiB)": 63.59,
"step": 410,
"train_speed(iter/s)": 0.02019
},
{
"epoch": 0.17180707927965225,
"grad_norm": 1.9007105346828383,
"learning_rate": 9.425827107790822e-06,
"loss": 0.86072025,
"memory(GiB)": 63.59,
"step": 415,
"train_speed(iter/s)": 0.020193
},
{
"epoch": 0.17387704409025045,
"grad_norm": 2.1940409564689656,
"learning_rate": 9.415154749199575e-06,
"loss": 0.89283857,
"memory(GiB)": 63.59,
"step": 420,
"train_speed(iter/s)": 0.020198
},
{
"epoch": 0.1759470089008487,
"grad_norm": 2.2711317580338912,
"learning_rate": 9.404482390608326e-06,
"loss": 0.8422184,
"memory(GiB)": 63.59,
"step": 425,
"train_speed(iter/s)": 0.020202
},
{
"epoch": 0.1780169737114469,
"grad_norm": 1.9734252369885248,
"learning_rate": 9.393810032017077e-06,
"loss": 0.90587616,
"memory(GiB)": 63.59,
"step": 430,
"train_speed(iter/s)": 0.020208
},
{
"epoch": 0.18008693852204513,
"grad_norm": 2.1358019624149653,
"learning_rate": 9.383137673425828e-06,
"loss": 0.87137203,
"memory(GiB)": 63.59,
"step": 435,
"train_speed(iter/s)": 0.020212
},
{
"epoch": 0.18215690333264334,
"grad_norm": 2.136663123639741,
"learning_rate": 9.372465314834579e-06,
"loss": 0.84276152,
"memory(GiB)": 63.59,
"step": 440,
"train_speed(iter/s)": 0.020218
},
{
"epoch": 0.18422686814324157,
"grad_norm": 2.1730526677654005,
"learning_rate": 9.361792956243332e-06,
"loss": 0.86879997,
"memory(GiB)": 63.59,
"step": 445,
"train_speed(iter/s)": 0.020222
},
{
"epoch": 0.18629683295383978,
"grad_norm": 1.7007840288725673,
"learning_rate": 9.351120597652081e-06,
"loss": 0.85356216,
"memory(GiB)": 63.59,
"step": 450,
"train_speed(iter/s)": 0.020224
},
{
"epoch": 0.18836679776443802,
"grad_norm": 2.333224503644692,
"learning_rate": 9.340448239060834e-06,
"loss": 0.87298975,
"memory(GiB)": 63.59,
"step": 455,
"train_speed(iter/s)": 0.02023
},
{
"epoch": 0.19043676257503622,
"grad_norm": 1.7497884985137717,
"learning_rate": 9.329775880469585e-06,
"loss": 0.89487724,
"memory(GiB)": 63.59,
"step": 460,
"train_speed(iter/s)": 0.02023
},
{
"epoch": 0.19250672738563446,
"grad_norm": 1.7881559718064066,
"learning_rate": 9.319103521878336e-06,
"loss": 0.88939381,
"memory(GiB)": 63.59,
"step": 465,
"train_speed(iter/s)": 0.020232
},
{
"epoch": 0.19457669219623266,
"grad_norm": 2.5056663267756605,
"learning_rate": 9.308431163287087e-06,
"loss": 0.85123787,
"memory(GiB)": 63.59,
"step": 470,
"train_speed(iter/s)": 0.020236
},
{
"epoch": 0.19664665700683087,
"grad_norm": 2.2621024798210403,
"learning_rate": 9.29775880469584e-06,
"loss": 0.87380323,
"memory(GiB)": 63.59,
"step": 475,
"train_speed(iter/s)": 0.020238
},
{
"epoch": 0.1987166218174291,
"grad_norm": 1.9008868824283842,
"learning_rate": 9.287086446104589e-06,
"loss": 0.82988033,
"memory(GiB)": 63.59,
"step": 480,
"train_speed(iter/s)": 0.020241
},
{
"epoch": 0.2007865866280273,
"grad_norm": 1.6960491773696469,
"learning_rate": 9.276414087513342e-06,
"loss": 0.85842476,
"memory(GiB)": 63.59,
"step": 485,
"train_speed(iter/s)": 0.020243
},
{
"epoch": 0.20285655143862555,
"grad_norm": 1.8755094068628242,
"learning_rate": 9.265741728922093e-06,
"loss": 0.86819458,
"memory(GiB)": 63.59,
"step": 490,
"train_speed(iter/s)": 0.020249
},
{
"epoch": 0.20492651624922376,
"grad_norm": 1.6029255395235227,
"learning_rate": 9.255069370330844e-06,
"loss": 0.86832209,
"memory(GiB)": 63.59,
"step": 495,
"train_speed(iter/s)": 0.020253
},
{
"epoch": 0.206996481059822,
"grad_norm": 1.8312986531673774,
"learning_rate": 9.244397011739595e-06,
"loss": 0.87804108,
"memory(GiB)": 63.59,
"step": 500,
"train_speed(iter/s)": 0.020256
},
{
"epoch": 0.2090664458704202,
"grad_norm": 2.2159879518724686,
"learning_rate": 9.233724653148346e-06,
"loss": 0.84401827,
"memory(GiB)": 63.59,
"step": 505,
"train_speed(iter/s)": 0.020258
},
{
"epoch": 0.21113641068101843,
"grad_norm": 1.9489190185976173,
"learning_rate": 9.223052294557098e-06,
"loss": 0.83081837,
"memory(GiB)": 63.59,
"step": 510,
"train_speed(iter/s)": 0.020262
},
{
"epoch": 0.21320637549161664,
"grad_norm": 1.8621375658015202,
"learning_rate": 9.21237993596585e-06,
"loss": 0.84451389,
"memory(GiB)": 63.59,
"step": 515,
"train_speed(iter/s)": 0.020265
},
{
"epoch": 0.21527634030221487,
"grad_norm": 2.657592267470185,
"learning_rate": 9.2017075773746e-06,
"loss": 0.85252399,
"memory(GiB)": 63.59,
"step": 520,
"train_speed(iter/s)": 0.020272
},
{
"epoch": 0.21734630511281308,
"grad_norm": 3.2134734541192556,
"learning_rate": 9.191035218783352e-06,
"loss": 0.85981674,
"memory(GiB)": 63.59,
"step": 525,
"train_speed(iter/s)": 0.020278
},
{
"epoch": 0.2194162699234113,
"grad_norm": 1.979014196110588,
"learning_rate": 9.180362860192104e-06,
"loss": 0.85360508,
"memory(GiB)": 63.59,
"step": 530,
"train_speed(iter/s)": 0.020281
},
{
"epoch": 0.22148623473400952,
"grad_norm": 2.263346027010783,
"learning_rate": 9.169690501600854e-06,
"loss": 0.8649641,
"memory(GiB)": 63.59,
"step": 535,
"train_speed(iter/s)": 0.020285
},
{
"epoch": 0.22355619954460773,
"grad_norm": 2.0257701801426786,
"learning_rate": 9.159018143009606e-06,
"loss": 0.85079117,
"memory(GiB)": 63.59,
"step": 540,
"train_speed(iter/s)": 0.020291
},
{
"epoch": 0.22562616435520597,
"grad_norm": 2.125007231598407,
"learning_rate": 9.148345784418357e-06,
"loss": 0.83591347,
"memory(GiB)": 63.59,
"step": 545,
"train_speed(iter/s)": 0.020292
},
{
"epoch": 0.22769612916580417,
"grad_norm": 1.830826573395782,
"learning_rate": 9.137673425827108e-06,
"loss": 0.8597187,
"memory(GiB)": 63.59,
"step": 550,
"train_speed(iter/s)": 0.020297
},
{
"epoch": 0.2297660939764024,
"grad_norm": 2.145744349446719,
"learning_rate": 9.12700106723586e-06,
"loss": 0.82627001,
"memory(GiB)": 63.59,
"step": 555,
"train_speed(iter/s)": 0.020299
},
{
"epoch": 0.23183605878700062,
"grad_norm": 1.895819945079046,
"learning_rate": 9.116328708644612e-06,
"loss": 0.83418722,
"memory(GiB)": 63.59,
"step": 560,
"train_speed(iter/s)": 0.020301
},
{
"epoch": 0.23390602359759885,
"grad_norm": 1.7330277741570008,
"learning_rate": 9.105656350053362e-06,
"loss": 0.83898754,
"memory(GiB)": 63.59,
"step": 565,
"train_speed(iter/s)": 0.020301
},
{
"epoch": 0.23597598840819706,
"grad_norm": 1.873262235276853,
"learning_rate": 9.094983991462114e-06,
"loss": 0.86104965,
"memory(GiB)": 63.59,
"step": 570,
"train_speed(iter/s)": 0.020307
},
{
"epoch": 0.2380459532187953,
"grad_norm": 1.8133555971052358,
"learning_rate": 9.084311632870865e-06,
"loss": 0.83760166,
"memory(GiB)": 63.59,
"step": 575,
"train_speed(iter/s)": 0.020309
},
{
"epoch": 0.2401159180293935,
"grad_norm": 2.1026280097135377,
"learning_rate": 9.073639274279616e-06,
"loss": 0.85497513,
"memory(GiB)": 63.59,
"step": 580,
"train_speed(iter/s)": 0.02031
},
{
"epoch": 0.2421858828399917,
"grad_norm": 1.7385609176743078,
"learning_rate": 9.062966915688367e-06,
"loss": 0.83206367,
"memory(GiB)": 63.59,
"step": 585,
"train_speed(iter/s)": 0.02031
},
{
"epoch": 0.24425584765058994,
"grad_norm": 2.310226819070514,
"learning_rate": 9.052294557097118e-06,
"loss": 0.82417412,
"memory(GiB)": 63.59,
"step": 590,
"train_speed(iter/s)": 0.020312
},
{
"epoch": 0.24632581246118815,
"grad_norm": 2.1660135303280126,
"learning_rate": 9.041622198505871e-06,
"loss": 0.8371232,
"memory(GiB)": 63.59,
"step": 595,
"train_speed(iter/s)": 0.020312
},
{
"epoch": 0.24839577727178638,
"grad_norm": 2.0932218548460493,
"learning_rate": 9.030949839914622e-06,
"loss": 0.86303692,
"memory(GiB)": 63.59,
"step": 600,
"train_speed(iter/s)": 0.020315
},
{
"epoch": 0.24839577727178638,
"eval_loss": 0.8761223554611206,
"eval_runtime": 333.7076,
"eval_samples_per_second": 18.717,
"eval_steps_per_second": 1.172,
"step": 600
},
{
"epoch": 0.2504657420823846,
"grad_norm": 1.9850417129561133,
"learning_rate": 9.020277481323373e-06,
"loss": 0.84975281,
"memory(GiB)": 63.59,
"step": 605,
"train_speed(iter/s)": 0.02006
},
{
"epoch": 0.2525357068929828,
"grad_norm": 1.8024791223082373,
"learning_rate": 9.009605122732124e-06,
"loss": 0.83602209,
"memory(GiB)": 63.59,
"step": 610,
"train_speed(iter/s)": 0.020063
},
{
"epoch": 0.25460567170358106,
"grad_norm": 1.795807849269691,
"learning_rate": 8.998932764140877e-06,
"loss": 0.84287434,
"memory(GiB)": 63.59,
"step": 615,
"train_speed(iter/s)": 0.020066
},
{
"epoch": 0.25667563651417924,
"grad_norm": 2.8462701485274855,
"learning_rate": 8.988260405549626e-06,
"loss": 0.86969414,
"memory(GiB)": 63.59,
"step": 620,
"train_speed(iter/s)": 0.020069
},
{
"epoch": 0.2587456013247775,
"grad_norm": 1.9467464102992238,
"learning_rate": 8.977588046958379e-06,
"loss": 0.84205284,
"memory(GiB)": 63.59,
"step": 625,
"train_speed(iter/s)": 0.020075
},
{
"epoch": 0.2608155661353757,
"grad_norm": 1.9359113111268293,
"learning_rate": 8.96691568836713e-06,
"loss": 0.83059912,
"memory(GiB)": 63.59,
"step": 630,
"train_speed(iter/s)": 0.02008
},
{
"epoch": 0.26288553094597394,
"grad_norm": 2.269649830017561,
"learning_rate": 8.956243329775881e-06,
"loss": 0.85204124,
"memory(GiB)": 63.59,
"step": 635,
"train_speed(iter/s)": 0.020082
},
{
"epoch": 0.2649554957565721,
"grad_norm": 2.0739328945699014,
"learning_rate": 8.945570971184632e-06,
"loss": 0.83141527,
"memory(GiB)": 63.59,
"step": 640,
"train_speed(iter/s)": 0.020085
},
{
"epoch": 0.26702546056717036,
"grad_norm": 2.39308971265692,
"learning_rate": 8.934898612593383e-06,
"loss": 0.82764578,
"memory(GiB)": 63.59,
"step": 645,
"train_speed(iter/s)": 0.020088
},
{
"epoch": 0.2690954253777686,
"grad_norm": 2.014414879589864,
"learning_rate": 8.924226254002136e-06,
"loss": 0.8550128,
"memory(GiB)": 63.59,
"step": 650,
"train_speed(iter/s)": 0.020095
},
{
"epoch": 0.2711653901883668,
"grad_norm": 2.8374721965360887,
"learning_rate": 8.913553895410887e-06,
"loss": 0.86329079,
"memory(GiB)": 63.59,
"step": 655,
"train_speed(iter/s)": 0.020098
},
{
"epoch": 0.273235354998965,
"grad_norm": 1.9583062241735367,
"learning_rate": 8.902881536819638e-06,
"loss": 0.83884621,
"memory(GiB)": 63.59,
"step": 660,
"train_speed(iter/s)": 0.020101
},
{
"epoch": 0.27530531980956324,
"grad_norm": 1.765892011718539,
"learning_rate": 8.892209178228389e-06,
"loss": 0.86448555,
"memory(GiB)": 63.59,
"step": 665,
"train_speed(iter/s)": 0.020105
},
{
"epoch": 0.2773752846201615,
"grad_norm": 1.7474590269235404,
"learning_rate": 8.88153681963714e-06,
"loss": 0.81313992,
"memory(GiB)": 63.59,
"step": 670,
"train_speed(iter/s)": 0.020107
},
{
"epoch": 0.27944524943075966,
"grad_norm": 1.9301834126279824,
"learning_rate": 8.870864461045891e-06,
"loss": 0.8374301,
"memory(GiB)": 63.59,
"step": 675,
"train_speed(iter/s)": 0.02011
},
{
"epoch": 0.2815152142413579,
"grad_norm": 1.876790685008959,
"learning_rate": 8.860192102454644e-06,
"loss": 0.82494678,
"memory(GiB)": 63.59,
"step": 680,
"train_speed(iter/s)": 0.020115
},
{
"epoch": 0.2835851790519561,
"grad_norm": 1.7854143077330529,
"learning_rate": 8.849519743863395e-06,
"loss": 0.83442841,
"memory(GiB)": 63.59,
"step": 685,
"train_speed(iter/s)": 0.020116
},
{
"epoch": 0.28565514386255436,
"grad_norm": 1.8923637542669056,
"learning_rate": 8.838847385272146e-06,
"loss": 0.82085514,
"memory(GiB)": 63.72,
"step": 690,
"train_speed(iter/s)": 0.020118
},
{
"epoch": 0.28772510867315254,
"grad_norm": 2.0170179705017066,
"learning_rate": 8.828175026680897e-06,
"loss": 0.85945168,
"memory(GiB)": 63.72,
"step": 695,
"train_speed(iter/s)": 0.020121
},
{
"epoch": 0.2897950734837508,
"grad_norm": 3.3823068740775755,
"learning_rate": 8.817502668089648e-06,
"loss": 0.82407131,
"memory(GiB)": 63.72,
"step": 700,
"train_speed(iter/s)": 0.020123
},
{
"epoch": 0.291865038294349,
"grad_norm": 1.9439481100139924,
"learning_rate": 8.806830309498399e-06,
"loss": 0.82105274,
"memory(GiB)": 63.72,
"step": 705,
"train_speed(iter/s)": 0.020126
},
{
"epoch": 0.2939350031049472,
"grad_norm": 1.8993570568677929,
"learning_rate": 8.796157950907152e-06,
"loss": 0.83391781,
"memory(GiB)": 63.72,
"step": 710,
"train_speed(iter/s)": 0.020131
},
{
"epoch": 0.2960049679155454,
"grad_norm": 2.175929579998878,
"learning_rate": 8.785485592315903e-06,
"loss": 0.84003325,
"memory(GiB)": 63.72,
"step": 715,
"train_speed(iter/s)": 0.020137
},
{
"epoch": 0.29807493272614366,
"grad_norm": 1.6910798969618672,
"learning_rate": 8.774813233724654e-06,
"loss": 0.82005548,
"memory(GiB)": 63.72,
"step": 720,
"train_speed(iter/s)": 0.02014
},
{
"epoch": 0.3001448975367419,
"grad_norm": 1.7577054177826072,
"learning_rate": 8.764140875133405e-06,
"loss": 0.84406672,
"memory(GiB)": 63.72,
"step": 725,
"train_speed(iter/s)": 0.020144
},
{
"epoch": 0.3022148623473401,
"grad_norm": 1.9343300270246129,
"learning_rate": 8.753468516542156e-06,
"loss": 0.81861668,
"memory(GiB)": 63.72,
"step": 730,
"train_speed(iter/s)": 0.020144
},
{
"epoch": 0.3042848271579383,
"grad_norm": 1.8986244788103208,
"learning_rate": 8.742796157950909e-06,
"loss": 0.81786537,
"memory(GiB)": 63.72,
"step": 735,
"train_speed(iter/s)": 0.020148
},
{
"epoch": 0.30635479196853654,
"grad_norm": 2.095799409220846,
"learning_rate": 8.73212379935966e-06,
"loss": 0.83321962,
"memory(GiB)": 63.72,
"step": 740,
"train_speed(iter/s)": 0.020151
},
{
"epoch": 0.3084247567791348,
"grad_norm": 1.9094006901482394,
"learning_rate": 8.72145144076841e-06,
"loss": 0.82653723,
"memory(GiB)": 63.72,
"step": 745,
"train_speed(iter/s)": 0.020152
},
{
"epoch": 0.31049472158973296,
"grad_norm": 2.126120113530993,
"learning_rate": 8.710779082177162e-06,
"loss": 0.85463696,
"memory(GiB)": 63.72,
"step": 750,
"train_speed(iter/s)": 0.020156
},
{
"epoch": 0.3125646864003312,
"grad_norm": 1.766780713214732,
"learning_rate": 8.700106723585913e-06,
"loss": 0.83797083,
"memory(GiB)": 63.72,
"step": 755,
"train_speed(iter/s)": 0.020157
},
{
"epoch": 0.31463465121092943,
"grad_norm": 1.8957319688723608,
"learning_rate": 8.689434364994664e-06,
"loss": 0.81888847,
"memory(GiB)": 63.72,
"step": 760,
"train_speed(iter/s)": 0.020159
},
{
"epoch": 0.3167046160215276,
"grad_norm": 1.9661061189594824,
"learning_rate": 8.678762006403417e-06,
"loss": 0.78800874,
"memory(GiB)": 63.72,
"step": 765,
"train_speed(iter/s)": 0.020164
},
{
"epoch": 0.31877458083212584,
"grad_norm": 1.8837863956075926,
"learning_rate": 8.668089647812166e-06,
"loss": 0.84463196,
"memory(GiB)": 63.72,
"step": 770,
"train_speed(iter/s)": 0.020167
},
{
"epoch": 0.3208445456427241,
"grad_norm": 2.5248078655238326,
"learning_rate": 8.657417289220919e-06,
"loss": 0.83094559,
"memory(GiB)": 63.72,
"step": 775,
"train_speed(iter/s)": 0.02017
},
{
"epoch": 0.3229145104533223,
"grad_norm": 1.8996595550385447,
"learning_rate": 8.64674493062967e-06,
"loss": 0.81705608,
"memory(GiB)": 63.72,
"step": 780,
"train_speed(iter/s)": 0.020173
},
{
"epoch": 0.3249844752639205,
"grad_norm": 1.8243459235808355,
"learning_rate": 8.63607257203842e-06,
"loss": 0.82983418,
"memory(GiB)": 63.72,
"step": 785,
"train_speed(iter/s)": 0.020177
},
{
"epoch": 0.3270544400745187,
"grad_norm": 2.125198435674726,
"learning_rate": 8.625400213447172e-06,
"loss": 0.85153885,
"memory(GiB)": 63.72,
"step": 790,
"train_speed(iter/s)": 0.02018
},
{
"epoch": 0.32912440488511696,
"grad_norm": 1.822527258966965,
"learning_rate": 8.614727854855925e-06,
"loss": 0.7932189,
"memory(GiB)": 63.72,
"step": 795,
"train_speed(iter/s)": 0.020184
},
{
"epoch": 0.3311943696957152,
"grad_norm": 1.9585269031801074,
"learning_rate": 8.604055496264676e-06,
"loss": 0.80502253,
"memory(GiB)": 63.72,
"step": 800,
"train_speed(iter/s)": 0.020188
},
{
"epoch": 0.3332643345063134,
"grad_norm": 1.9244862407118186,
"learning_rate": 8.593383137673427e-06,
"loss": 0.81400661,
"memory(GiB)": 63.72,
"step": 805,
"train_speed(iter/s)": 0.020192
},
{
"epoch": 0.3353342993169116,
"grad_norm": 1.8781928942945239,
"learning_rate": 8.582710779082178e-06,
"loss": 0.82624207,
"memory(GiB)": 63.72,
"step": 810,
"train_speed(iter/s)": 0.020195
},
{
"epoch": 0.33740426412750985,
"grad_norm": 2.4821098212553108,
"learning_rate": 8.572038420490929e-06,
"loss": 0.81296177,
"memory(GiB)": 63.72,
"step": 815,
"train_speed(iter/s)": 0.020197
},
{
"epoch": 0.339474228938108,
"grad_norm": 3.2468832100225877,
"learning_rate": 8.561366061899681e-06,
"loss": 0.81447935,
"memory(GiB)": 63.72,
"step": 820,
"train_speed(iter/s)": 0.0202
},
{
"epoch": 0.34154419374870626,
"grad_norm": 1.726217016729622,
"learning_rate": 8.55069370330843e-06,
"loss": 0.82119083,
"memory(GiB)": 63.72,
"step": 825,
"train_speed(iter/s)": 0.020203
},
{
"epoch": 0.3436141585593045,
"grad_norm": 1.8200397633087098,
"learning_rate": 8.540021344717184e-06,
"loss": 0.80688438,
"memory(GiB)": 63.72,
"step": 830,
"train_speed(iter/s)": 0.020205
},
{
"epoch": 0.34568412336990273,
"grad_norm": 1.7077741062644576,
"learning_rate": 8.529348986125935e-06,
"loss": 0.83244801,
"memory(GiB)": 63.72,
"step": 835,
"train_speed(iter/s)": 0.020205
},
{
"epoch": 0.3477540881805009,
"grad_norm": 2.582896676288874,
"learning_rate": 8.518676627534686e-06,
"loss": 0.81135302,
"memory(GiB)": 63.72,
"step": 840,
"train_speed(iter/s)": 0.020208
},
{
"epoch": 0.34982405299109914,
"grad_norm": 3.4613638587514033,
"learning_rate": 8.508004268943437e-06,
"loss": 0.80561113,
"memory(GiB)": 63.72,
"step": 845,
"train_speed(iter/s)": 0.020209
},
{
"epoch": 0.3518940178016974,
"grad_norm": 1.6179386547462884,
"learning_rate": 8.49733191035219e-06,
"loss": 0.82198238,
"memory(GiB)": 63.72,
"step": 850,
"train_speed(iter/s)": 0.020211
},
{
"epoch": 0.3539639826122956,
"grad_norm": 2.202413903162471,
"learning_rate": 8.48665955176094e-06,
"loss": 0.78598285,
"memory(GiB)": 63.72,
"step": 855,
"train_speed(iter/s)": 0.020214
},
{
"epoch": 0.3560339474228938,
"grad_norm": 1.9513315920239633,
"learning_rate": 8.475987193169691e-06,
"loss": 0.80893326,
"memory(GiB)": 63.72,
"step": 860,
"train_speed(iter/s)": 0.020219
},
{
"epoch": 0.358103912233492,
"grad_norm": 1.9113374189570778,
"learning_rate": 8.465314834578443e-06,
"loss": 0.8136569,
"memory(GiB)": 63.72,
"step": 865,
"train_speed(iter/s)": 0.020222
},
{
"epoch": 0.36017387704409026,
"grad_norm": 2.084935583050277,
"learning_rate": 8.454642475987194e-06,
"loss": 0.81384058,
"memory(GiB)": 63.72,
"step": 870,
"train_speed(iter/s)": 0.020226
},
{
"epoch": 0.3622438418546885,
"grad_norm": 1.6048226105298027,
"learning_rate": 8.443970117395945e-06,
"loss": 0.81689348,
"memory(GiB)": 63.72,
"step": 875,
"train_speed(iter/s)": 0.020228
},
{
"epoch": 0.3643138066652867,
"grad_norm": 1.8081549724032602,
"learning_rate": 8.433297758804696e-06,
"loss": 0.8224082,
"memory(GiB)": 63.72,
"step": 880,
"train_speed(iter/s)": 0.02023
},
{
"epoch": 0.3663837714758849,
"grad_norm": 1.8184484923663322,
"learning_rate": 8.422625400213448e-06,
"loss": 0.78473282,
"memory(GiB)": 63.72,
"step": 885,
"train_speed(iter/s)": 0.020232
},
{
"epoch": 0.36845373628648315,
"grad_norm": 2.010882441005616,
"learning_rate": 8.4119530416222e-06,
"loss": 0.81135426,
"memory(GiB)": 63.72,
"step": 890,
"train_speed(iter/s)": 0.020232
},
{
"epoch": 0.3705237010970813,
"grad_norm": 2.363919887534564,
"learning_rate": 8.40128068303095e-06,
"loss": 0.80090466,
"memory(GiB)": 63.72,
"step": 895,
"train_speed(iter/s)": 0.020234
},
{
"epoch": 0.37259366590767956,
"grad_norm": 1.6332844070852461,
"learning_rate": 8.390608324439701e-06,
"loss": 0.81239138,
"memory(GiB)": 63.72,
"step": 900,
"train_speed(iter/s)": 0.020237
},
{
"epoch": 0.37259366590767956,
"eval_loss": 0.8537026047706604,
"eval_runtime": 333.2325,
"eval_samples_per_second": 18.744,
"eval_steps_per_second": 1.173,
"step": 900
},
{
"epoch": 0.3746636307182778,
"grad_norm": 2.4425379037284607,
"learning_rate": 8.379935965848454e-06,
"loss": 0.81980333,
"memory(GiB)": 63.72,
"step": 905,
"train_speed(iter/s)": 0.02007
},
{
"epoch": 0.37673359552887603,
"grad_norm": 2.2417018048030575,
"learning_rate": 8.369263607257204e-06,
"loss": 0.82684288,
"memory(GiB)": 63.72,
"step": 910,
"train_speed(iter/s)": 0.020071
},
{
"epoch": 0.3788035603394742,
"grad_norm": 1.7587001424417825,
"learning_rate": 8.358591248665956e-06,
"loss": 0.83183241,
"memory(GiB)": 63.72,
"step": 915,
"train_speed(iter/s)": 0.020074
},
{
"epoch": 0.38087352515007245,
"grad_norm": 1.5739208911232379,
"learning_rate": 8.347918890074707e-06,
"loss": 0.76680841,
"memory(GiB)": 63.72,
"step": 920,
"train_speed(iter/s)": 0.020076
},
{
"epoch": 0.3829434899606707,
"grad_norm": 1.8684171867799126,
"learning_rate": 8.337246531483458e-06,
"loss": 0.85463772,
"memory(GiB)": 63.72,
"step": 925,
"train_speed(iter/s)": 0.020079
},
{
"epoch": 0.3850134547712689,
"grad_norm": 1.8550376770414303,
"learning_rate": 8.32657417289221e-06,
"loss": 0.81911144,
"memory(GiB)": 63.72,
"step": 930,
"train_speed(iter/s)": 0.020083
},
{
"epoch": 0.3870834195818671,
"grad_norm": 1.8194471444369447,
"learning_rate": 8.31590181430096e-06,
"loss": 0.83044968,
"memory(GiB)": 63.72,
"step": 935,
"train_speed(iter/s)": 0.020086
},
{
"epoch": 0.38915338439246533,
"grad_norm": 2.095169413730359,
"learning_rate": 8.305229455709713e-06,
"loss": 0.82608871,
"memory(GiB)": 63.72,
"step": 940,
"train_speed(iter/s)": 0.020088
},
{
"epoch": 0.39122334920306356,
"grad_norm": 13.855393242400119,
"learning_rate": 8.294557097118464e-06,
"loss": 0.81676388,
"memory(GiB)": 63.72,
"step": 945,
"train_speed(iter/s)": 0.02009
},
{
"epoch": 0.39329331401366174,
"grad_norm": 2.100046441650532,
"learning_rate": 8.283884738527215e-06,
"loss": 0.81071377,
"memory(GiB)": 63.72,
"step": 950,
"train_speed(iter/s)": 0.020093
},
{
"epoch": 0.39536327882426,
"grad_norm": 2.1608254386705594,
"learning_rate": 8.273212379935966e-06,
"loss": 0.78902674,
"memory(GiB)": 63.72,
"step": 955,
"train_speed(iter/s)": 0.020095
},
{
"epoch": 0.3974332436348582,
"grad_norm": 1.61490095503505,
"learning_rate": 8.262540021344719e-06,
"loss": 0.78527632,
"memory(GiB)": 63.72,
"step": 960,
"train_speed(iter/s)": 0.020097
},
{
"epoch": 0.39950320844545645,
"grad_norm": 2.589460194979307,
"learning_rate": 8.251867662753468e-06,
"loss": 0.81925201,
"memory(GiB)": 63.72,
"step": 965,
"train_speed(iter/s)": 0.020098
},
{
"epoch": 0.4015731732560546,
"grad_norm": 1.8550853889727008,
"learning_rate": 8.241195304162221e-06,
"loss": 0.80688972,
"memory(GiB)": 63.72,
"step": 970,
"train_speed(iter/s)": 0.020101
},
{
"epoch": 0.40364313806665286,
"grad_norm": 2.4637090355397517,
"learning_rate": 8.230522945570972e-06,
"loss": 0.82061405,
"memory(GiB)": 63.72,
"step": 975,
"train_speed(iter/s)": 0.020104
},
{
"epoch": 0.4057131028772511,
"grad_norm": 1.9846780448601058,
"learning_rate": 8.219850586979723e-06,
"loss": 0.80861397,
"memory(GiB)": 63.72,
"step": 980,
"train_speed(iter/s)": 0.020106
},
{
"epoch": 0.40778306768784933,
"grad_norm": 1.9138045073506678,
"learning_rate": 8.209178228388474e-06,
"loss": 0.79171362,
"memory(GiB)": 63.72,
"step": 985,
"train_speed(iter/s)": 0.020108
},
{
"epoch": 0.4098530324984475,
"grad_norm": 2.100897386160194,
"learning_rate": 8.198505869797227e-06,
"loss": 0.78543482,
"memory(GiB)": 63.72,
"step": 990,
"train_speed(iter/s)": 0.020113
},
{
"epoch": 0.41192299730904575,
"grad_norm": 2.1398479736163667,
"learning_rate": 8.187833511205976e-06,
"loss": 0.79959226,
"memory(GiB)": 63.72,
"step": 995,
"train_speed(iter/s)": 0.020114
},
{
"epoch": 0.413992962119644,
"grad_norm": 1.8543885416746075,
"learning_rate": 8.177161152614729e-06,
"loss": 0.80144148,
"memory(GiB)": 63.72,
"step": 1000,
"train_speed(iter/s)": 0.020116
},
{
"epoch": 0.41606292693024216,
"grad_norm": 1.8895963845216188,
"learning_rate": 8.16648879402348e-06,
"loss": 0.78557086,
"memory(GiB)": 63.72,
"step": 1005,
"train_speed(iter/s)": 0.020118
},
{
"epoch": 0.4181328917408404,
"grad_norm": 1.7583886118404264,
"learning_rate": 8.155816435432231e-06,
"loss": 0.81441746,
"memory(GiB)": 63.72,
"step": 1010,
"train_speed(iter/s)": 0.02012
},
{
"epoch": 0.42020285655143863,
"grad_norm": 1.8640464710188405,
"learning_rate": 8.145144076840982e-06,
"loss": 0.76718016,
"memory(GiB)": 63.72,
"step": 1015,
"train_speed(iter/s)": 0.020122
},
{
"epoch": 0.42227282136203687,
"grad_norm": 2.0754981449007084,
"learning_rate": 8.134471718249733e-06,
"loss": 0.78537526,
"memory(GiB)": 63.72,
"step": 1020,
"train_speed(iter/s)": 0.020125
},
{
"epoch": 0.42434278617263504,
"grad_norm": 2.1358764475250105,
"learning_rate": 8.123799359658486e-06,
"loss": 0.82194328,
"memory(GiB)": 63.72,
"step": 1025,
"train_speed(iter/s)": 0.020128
},
{
"epoch": 0.4264127509832333,
"grad_norm": 1.940572767867165,
"learning_rate": 8.113127001067237e-06,
"loss": 0.8162715,
"memory(GiB)": 63.72,
"step": 1030,
"train_speed(iter/s)": 0.020131
},
{
"epoch": 0.4284827157938315,
"grad_norm": 1.7824953515185047,
"learning_rate": 8.102454642475988e-06,
"loss": 0.78834782,
"memory(GiB)": 63.72,
"step": 1035,
"train_speed(iter/s)": 0.020133
},
{
"epoch": 0.43055268060442975,
"grad_norm": 1.9585541886433688,
"learning_rate": 8.091782283884739e-06,
"loss": 0.79206867,
"memory(GiB)": 63.72,
"step": 1040,
"train_speed(iter/s)": 0.020136
},
{
"epoch": 0.43262264541502793,
"grad_norm": 1.6194935665114412,
"learning_rate": 8.081109925293492e-06,
"loss": 0.80889845,
"memory(GiB)": 63.72,
"step": 1045,
"train_speed(iter/s)": 0.020139
},
{
"epoch": 0.43469261022562616,
"grad_norm": 1.5909296898104581,
"learning_rate": 8.070437566702241e-06,
"loss": 0.76998816,
"memory(GiB)": 63.72,
"step": 1050,
"train_speed(iter/s)": 0.020141
},
{
"epoch": 0.4367625750362244,
"grad_norm": 1.582985265467202,
"learning_rate": 8.059765208110994e-06,
"loss": 0.79827099,
"memory(GiB)": 63.72,
"step": 1055,
"train_speed(iter/s)": 0.020143
},
{
"epoch": 0.4388325398468226,
"grad_norm": 1.9696406410447012,
"learning_rate": 8.049092849519743e-06,
"loss": 0.79220991,
"memory(GiB)": 63.72,
"step": 1060,
"train_speed(iter/s)": 0.020146
},
{
"epoch": 0.4409025046574208,
"grad_norm": 1.9479888997003834,
"learning_rate": 8.038420490928496e-06,
"loss": 0.79184585,
"memory(GiB)": 63.72,
"step": 1065,
"train_speed(iter/s)": 0.020148
},
{
"epoch": 0.44297246946801905,
"grad_norm": 1.7883498032309324,
"learning_rate": 8.027748132337247e-06,
"loss": 0.78507504,
"memory(GiB)": 63.72,
"step": 1070,
"train_speed(iter/s)": 0.02015
},
{
"epoch": 0.4450424342786173,
"grad_norm": 1.6985331753731079,
"learning_rate": 8.017075773745998e-06,
"loss": 0.81149197,
"memory(GiB)": 63.72,
"step": 1075,
"train_speed(iter/s)": 0.020154
},
{
"epoch": 0.44711239908921546,
"grad_norm": 1.7646873640943033,
"learning_rate": 8.006403415154749e-06,
"loss": 0.77548814,
"memory(GiB)": 63.72,
"step": 1080,
"train_speed(iter/s)": 0.020158
},
{
"epoch": 0.4491823638998137,
"grad_norm": 1.7739180508215708,
"learning_rate": 7.995731056563502e-06,
"loss": 0.77309542,
"memory(GiB)": 63.72,
"step": 1085,
"train_speed(iter/s)": 0.02016
},
{
"epoch": 0.45125232871041193,
"grad_norm": 1.6366153166920923,
"learning_rate": 7.985058697972253e-06,
"loss": 0.80448093,
"memory(GiB)": 63.72,
"step": 1090,
"train_speed(iter/s)": 0.020163
},
{
"epoch": 0.45332229352101017,
"grad_norm": 1.9520729703143727,
"learning_rate": 7.974386339381004e-06,
"loss": 0.78142538,
"memory(GiB)": 63.72,
"step": 1095,
"train_speed(iter/s)": 0.020164
},
{
"epoch": 0.45539225833160835,
"grad_norm": 1.9086497373347489,
"learning_rate": 7.963713980789755e-06,
"loss": 0.7876678,
"memory(GiB)": 63.72,
"step": 1100,
"train_speed(iter/s)": 0.020167
},
{
"epoch": 0.4574622231422066,
"grad_norm": 1.773963819363606,
"learning_rate": 7.953041622198506e-06,
"loss": 0.817309,
"memory(GiB)": 63.72,
"step": 1105,
"train_speed(iter/s)": 0.020169
},
{
"epoch": 0.4595321879528048,
"grad_norm": 1.7610901122064158,
"learning_rate": 7.942369263607259e-06,
"loss": 0.81472855,
"memory(GiB)": 63.72,
"step": 1110,
"train_speed(iter/s)": 0.020171
},
{
"epoch": 0.461602152763403,
"grad_norm": 1.8809865504177992,
"learning_rate": 7.931696905016008e-06,
"loss": 0.79487166,
"memory(GiB)": 63.72,
"step": 1115,
"train_speed(iter/s)": 0.020174
},
{
"epoch": 0.46367211757400123,
"grad_norm": 1.8810868009507724,
"learning_rate": 7.92102454642476e-06,
"loss": 0.78505554,
"memory(GiB)": 63.72,
"step": 1120,
"train_speed(iter/s)": 0.020176
},
{
"epoch": 0.46574208238459947,
"grad_norm": 1.7134608698183469,
"learning_rate": 7.910352187833512e-06,
"loss": 0.77789249,
"memory(GiB)": 63.72,
"step": 1125,
"train_speed(iter/s)": 0.020179
},
{
"epoch": 0.4678120471951977,
"grad_norm": 1.9145455941813492,
"learning_rate": 7.899679829242263e-06,
"loss": 0.76669245,
"memory(GiB)": 63.72,
"step": 1130,
"train_speed(iter/s)": 0.020182
},
{
"epoch": 0.4698820120057959,
"grad_norm": 1.9752917655252427,
"learning_rate": 7.889007470651014e-06,
"loss": 0.77915101,
"memory(GiB)": 63.72,
"step": 1135,
"train_speed(iter/s)": 0.020184
},
{
"epoch": 0.4719519768163941,
"grad_norm": 1.8705706085741929,
"learning_rate": 7.878335112059767e-06,
"loss": 0.78053985,
"memory(GiB)": 63.72,
"step": 1140,
"train_speed(iter/s)": 0.020187
},
{
"epoch": 0.47402194162699235,
"grad_norm": 1.8137417073497548,
"learning_rate": 7.867662753468518e-06,
"loss": 0.8304471,
"memory(GiB)": 63.72,
"step": 1145,
"train_speed(iter/s)": 0.020189
},
{
"epoch": 0.4760919064375906,
"grad_norm": 1.7537064971860614,
"learning_rate": 7.856990394877269e-06,
"loss": 0.76652546,
"memory(GiB)": 63.72,
"step": 1150,
"train_speed(iter/s)": 0.020192
},
{
"epoch": 0.47816187124818876,
"grad_norm": 1.8981437943138895,
"learning_rate": 7.84631803628602e-06,
"loss": 0.77384648,
"memory(GiB)": 63.72,
"step": 1155,
"train_speed(iter/s)": 0.020194
},
{
"epoch": 0.480231836058787,
"grad_norm": 1.968718081590253,
"learning_rate": 7.83564567769477e-06,
"loss": 0.81451969,
"memory(GiB)": 63.72,
"step": 1160,
"train_speed(iter/s)": 0.020196
},
{
"epoch": 0.48230180086938523,
"grad_norm": 1.9755371858466928,
"learning_rate": 7.824973319103523e-06,
"loss": 0.79220142,
"memory(GiB)": 63.72,
"step": 1165,
"train_speed(iter/s)": 0.020198
},
{
"epoch": 0.4843717656799834,
"grad_norm": 1.8485795416766981,
"learning_rate": 7.814300960512274e-06,
"loss": 0.80019064,
"memory(GiB)": 63.72,
"step": 1170,
"train_speed(iter/s)": 0.0202
},
{
"epoch": 0.48644173049058165,
"grad_norm": 2.388358446370589,
"learning_rate": 7.803628601921026e-06,
"loss": 0.8037425,
"memory(GiB)": 63.72,
"step": 1175,
"train_speed(iter/s)": 0.020202
},
{
"epoch": 0.4885116953011799,
"grad_norm": 1.7963803355457697,
"learning_rate": 7.792956243329777e-06,
"loss": 0.76506805,
"memory(GiB)": 63.72,
"step": 1180,
"train_speed(iter/s)": 0.020205
},
{
"epoch": 0.4905816601117781,
"grad_norm": 1.642582867995439,
"learning_rate": 7.782283884738528e-06,
"loss": 0.76571236,
"memory(GiB)": 63.72,
"step": 1185,
"train_speed(iter/s)": 0.020206
},
{
"epoch": 0.4926516249223763,
"grad_norm": 1.8722199369588735,
"learning_rate": 7.771611526147279e-06,
"loss": 0.81547689,
"memory(GiB)": 63.72,
"step": 1190,
"train_speed(iter/s)": 0.020209
},
{
"epoch": 0.49472158973297453,
"grad_norm": 1.6444393246271363,
"learning_rate": 7.760939167556031e-06,
"loss": 0.77196584,
"memory(GiB)": 63.72,
"step": 1195,
"train_speed(iter/s)": 0.02021
},
{
"epoch": 0.49679155454357277,
"grad_norm": 1.7322851516861686,
"learning_rate": 7.75026680896478e-06,
"loss": 0.78245749,
"memory(GiB)": 63.72,
"step": 1200,
"train_speed(iter/s)": 0.020211
},
{
"epoch": 0.49679155454357277,
"eval_loss": 0.8388283252716064,
"eval_runtime": 333.9836,
"eval_samples_per_second": 18.702,
"eval_steps_per_second": 1.171,
"step": 1200
},
{
"epoch": 0.498861519354171,
"grad_norm": 1.779853897918009,
"learning_rate": 7.739594450373533e-06,
"loss": 0.76194401,
"memory(GiB)": 63.72,
"step": 1205,
"train_speed(iter/s)": 0.020085
},
{
"epoch": 0.5009314841647692,
"grad_norm": 1.6935101575699751,
"learning_rate": 7.728922091782284e-06,
"loss": 0.76538324,
"memory(GiB)": 63.72,
"step": 1210,
"train_speed(iter/s)": 0.020088
},
{
"epoch": 0.5030014489753675,
"grad_norm": 2.1568052824101196,
"learning_rate": 7.718249733191036e-06,
"loss": 0.80119467,
"memory(GiB)": 63.72,
"step": 1215,
"train_speed(iter/s)": 0.02009
},
{
"epoch": 0.5050714137859657,
"grad_norm": 2.0414007101619815,
"learning_rate": 7.707577374599787e-06,
"loss": 0.78779106,
"memory(GiB)": 63.72,
"step": 1220,
"train_speed(iter/s)": 0.020091
},
{
"epoch": 0.5071413785965638,
"grad_norm": 1.6191663238961727,
"learning_rate": 7.69690501600854e-06,
"loss": 0.79252872,
"memory(GiB)": 63.72,
"step": 1225,
"train_speed(iter/s)": 0.020092
},
{
"epoch": 0.5092113434071621,
"grad_norm": 1.8617354993121655,
"learning_rate": 7.68623265741729e-06,
"loss": 0.76940928,
"memory(GiB)": 63.72,
"step": 1230,
"train_speed(iter/s)": 0.020095
},
{
"epoch": 0.5112813082177603,
"grad_norm": 2.1148307227706,
"learning_rate": 7.675560298826041e-06,
"loss": 0.81177235,
"memory(GiB)": 63.72,
"step": 1235,
"train_speed(iter/s)": 0.020096
},
{
"epoch": 0.5133512730283585,
"grad_norm": 1.858296305288767,
"learning_rate": 7.664887940234792e-06,
"loss": 0.78712654,
"memory(GiB)": 63.72,
"step": 1240,
"train_speed(iter/s)": 0.020097
},
{
"epoch": 0.5154212378389568,
"grad_norm": 2.040424767723149,
"learning_rate": 7.654215581643543e-06,
"loss": 0.7963089,
"memory(GiB)": 63.72,
"step": 1245,
"train_speed(iter/s)": 0.020099
},
{
"epoch": 0.517491202649555,
"grad_norm": 1.7313703601186623,
"learning_rate": 7.643543223052296e-06,
"loss": 0.76896205,
"memory(GiB)": 63.72,
"step": 1250,
"train_speed(iter/s)": 0.020102
},
{
"epoch": 0.5195611674601531,
"grad_norm": 1.6916331849372186,
"learning_rate": 7.632870864461046e-06,
"loss": 0.78258944,
"memory(GiB)": 63.72,
"step": 1255,
"train_speed(iter/s)": 0.020104
},
{
"epoch": 0.5216311322707514,
"grad_norm": 2.1058096966812303,
"learning_rate": 7.622198505869797e-06,
"loss": 0.80921211,
"memory(GiB)": 63.72,
"step": 1260,
"train_speed(iter/s)": 0.020105
},
{
"epoch": 0.5237010970813496,
"grad_norm": 1.7220759067410432,
"learning_rate": 7.611526147278549e-06,
"loss": 0.77246647,
"memory(GiB)": 63.72,
"step": 1265,
"train_speed(iter/s)": 0.020106
},
{
"epoch": 0.5257710618919479,
"grad_norm": 2.084224319084108,
"learning_rate": 7.6008537886873e-06,
"loss": 0.7677907,
"memory(GiB)": 63.72,
"step": 1270,
"train_speed(iter/s)": 0.020107
},
{
"epoch": 0.5278410267025461,
"grad_norm": 1.7928505615246706,
"learning_rate": 7.590181430096052e-06,
"loss": 0.78496704,
"memory(GiB)": 63.72,
"step": 1275,
"train_speed(iter/s)": 0.020108
},
{
"epoch": 0.5299109915131442,
"grad_norm": 1.8397320603347174,
"learning_rate": 7.579509071504803e-06,
"loss": 0.77303753,
"memory(GiB)": 63.72,
"step": 1280,
"train_speed(iter/s)": 0.02011
},
{
"epoch": 0.5319809563237425,
"grad_norm": 2.1479969295234187,
"learning_rate": 7.568836712913554e-06,
"loss": 0.75871119,
"memory(GiB)": 63.72,
"step": 1285,
"train_speed(iter/s)": 0.020112
},
{
"epoch": 0.5340509211343407,
"grad_norm": 1.94767502078934,
"learning_rate": 7.558164354322306e-06,
"loss": 0.75106993,
"memory(GiB)": 63.72,
"step": 1290,
"train_speed(iter/s)": 0.020114
},
{
"epoch": 0.5361208859449389,
"grad_norm": 1.5236425325852578,
"learning_rate": 7.547491995731058e-06,
"loss": 0.79110327,
"memory(GiB)": 63.72,
"step": 1295,
"train_speed(iter/s)": 0.020116
},
{
"epoch": 0.5381908507555372,
"grad_norm": 1.8541149671409907,
"learning_rate": 7.536819637139808e-06,
"loss": 0.77403798,
"memory(GiB)": 63.72,
"step": 1300,
"train_speed(iter/s)": 0.020116
},
{
"epoch": 0.5402608155661354,
"grad_norm": 1.8743174944996448,
"learning_rate": 7.52614727854856e-06,
"loss": 0.77032347,
"memory(GiB)": 71.94,
"step": 1305,
"train_speed(iter/s)": 0.020118
},
{
"epoch": 0.5423307803767335,
"grad_norm": 2.579806479546849,
"learning_rate": 7.51547491995731e-06,
"loss": 0.76461482,
"memory(GiB)": 71.94,
"step": 1310,
"train_speed(iter/s)": 0.020119
},
{
"epoch": 0.5444007451873318,
"grad_norm": 2.0039452129208035,
"learning_rate": 7.504802561366062e-06,
"loss": 0.77457762,
"memory(GiB)": 71.94,
"step": 1315,
"train_speed(iter/s)": 0.020121
},
{
"epoch": 0.54647070999793,
"grad_norm": 2.060283685569936,
"learning_rate": 7.494130202774814e-06,
"loss": 0.77914829,
"memory(GiB)": 71.94,
"step": 1320,
"train_speed(iter/s)": 0.020123
},
{
"epoch": 0.5485406748085283,
"grad_norm": 2.163132135636586,
"learning_rate": 7.483457844183565e-06,
"loss": 0.77322574,
"memory(GiB)": 71.94,
"step": 1325,
"train_speed(iter/s)": 0.020125
},
{
"epoch": 0.5506106396191265,
"grad_norm": 1.842195467860799,
"learning_rate": 7.472785485592316e-06,
"loss": 0.77454052,
"memory(GiB)": 71.94,
"step": 1330,
"train_speed(iter/s)": 0.020126
},
{
"epoch": 0.5526806044297247,
"grad_norm": 1.775275008552653,
"learning_rate": 7.462113127001068e-06,
"loss": 0.77025108,
"memory(GiB)": 71.94,
"step": 1335,
"train_speed(iter/s)": 0.020128
},
{
"epoch": 0.554750569240323,
"grad_norm": 2.165651142341684,
"learning_rate": 7.451440768409819e-06,
"loss": 0.78470011,
"memory(GiB)": 71.94,
"step": 1340,
"train_speed(iter/s)": 0.02013
},
{
"epoch": 0.5568205340509211,
"grad_norm": 1.6530168942960388,
"learning_rate": 7.440768409818571e-06,
"loss": 0.74261112,
"memory(GiB)": 71.94,
"step": 1345,
"train_speed(iter/s)": 0.020131
},
{
"epoch": 0.5588904988615193,
"grad_norm": 2.1178890231616694,
"learning_rate": 7.430096051227322e-06,
"loss": 0.77076225,
"memory(GiB)": 71.94,
"step": 1350,
"train_speed(iter/s)": 0.020132
},
{
"epoch": 0.5609604636721176,
"grad_norm": 1.6332209286889638,
"learning_rate": 7.419423692636073e-06,
"loss": 0.76129122,
"memory(GiB)": 71.94,
"step": 1355,
"train_speed(iter/s)": 0.020134
},
{
"epoch": 0.5630304284827158,
"grad_norm": 1.9276105656674607,
"learning_rate": 7.408751334044825e-06,
"loss": 0.77616062,
"memory(GiB)": 71.94,
"step": 1360,
"train_speed(iter/s)": 0.020136
},
{
"epoch": 0.565100393293314,
"grad_norm": 1.838664332126464,
"learning_rate": 7.398078975453575e-06,
"loss": 0.77545385,
"memory(GiB)": 71.94,
"step": 1365,
"train_speed(iter/s)": 0.020137
},
{
"epoch": 0.5671703581039123,
"grad_norm": 2.090052030958157,
"learning_rate": 7.387406616862327e-06,
"loss": 0.7824297,
"memory(GiB)": 71.94,
"step": 1370,
"train_speed(iter/s)": 0.020139
},
{
"epoch": 0.5692403229145104,
"grad_norm": 1.7799554116738177,
"learning_rate": 7.376734258271079e-06,
"loss": 0.77833185,
"memory(GiB)": 71.94,
"step": 1375,
"train_speed(iter/s)": 0.020142
},
{
"epoch": 0.5713102877251087,
"grad_norm": 2.266691996975209,
"learning_rate": 7.366061899679829e-06,
"loss": 0.77535782,
"memory(GiB)": 71.94,
"step": 1380,
"train_speed(iter/s)": 0.020144
},
{
"epoch": 0.5733802525357069,
"grad_norm": 1.8220471587007605,
"learning_rate": 7.355389541088581e-06,
"loss": 0.76158247,
"memory(GiB)": 71.94,
"step": 1385,
"train_speed(iter/s)": 0.020146
},
{
"epoch": 0.5754502173463051,
"grad_norm": 1.7869060368578336,
"learning_rate": 7.344717182497333e-06,
"loss": 0.79457912,
"memory(GiB)": 71.94,
"step": 1390,
"train_speed(iter/s)": 0.020147
},
{
"epoch": 0.5775201821569034,
"grad_norm": 2.730877403121895,
"learning_rate": 7.334044823906084e-06,
"loss": 0.75181475,
"memory(GiB)": 71.94,
"step": 1395,
"train_speed(iter/s)": 0.020149
},
{
"epoch": 0.5795901469675016,
"grad_norm": 2.091944883020518,
"learning_rate": 7.323372465314835e-06,
"loss": 0.75992446,
"memory(GiB)": 71.94,
"step": 1400,
"train_speed(iter/s)": 0.02015
},
{
"epoch": 0.5816601117780997,
"grad_norm": 1.5904822426334966,
"learning_rate": 7.312700106723587e-06,
"loss": 0.77254944,
"memory(GiB)": 71.94,
"step": 1405,
"train_speed(iter/s)": 0.020152
},
{
"epoch": 0.583730076588698,
"grad_norm": 1.673083919686743,
"learning_rate": 7.302027748132338e-06,
"loss": 0.74836388,
"memory(GiB)": 71.94,
"step": 1410,
"train_speed(iter/s)": 0.020152
},
{
"epoch": 0.5858000413992962,
"grad_norm": 2.05811523971159,
"learning_rate": 7.29135538954109e-06,
"loss": 0.74358282,
"memory(GiB)": 71.94,
"step": 1415,
"train_speed(iter/s)": 0.020154
},
{
"epoch": 0.5878700062098944,
"grad_norm": 1.983632865952002,
"learning_rate": 7.28068303094984e-06,
"loss": 0.78234367,
"memory(GiB)": 71.94,
"step": 1420,
"train_speed(iter/s)": 0.020156
},
{
"epoch": 0.5899399710204927,
"grad_norm": 1.6612296882759847,
"learning_rate": 7.270010672358592e-06,
"loss": 0.76740494,
"memory(GiB)": 71.94,
"step": 1425,
"train_speed(iter/s)": 0.020157
},
{
"epoch": 0.5920099358310908,
"grad_norm": 1.8232818202515155,
"learning_rate": 7.259338313767344e-06,
"loss": 0.76410437,
"memory(GiB)": 71.94,
"step": 1430,
"train_speed(iter/s)": 0.020159
},
{
"epoch": 0.5940799006416891,
"grad_norm": 1.6871789120586522,
"learning_rate": 7.248665955176094e-06,
"loss": 0.76673613,
"memory(GiB)": 71.94,
"step": 1435,
"train_speed(iter/s)": 0.02016
},
{
"epoch": 0.5961498654522873,
"grad_norm": 1.9181669169557467,
"learning_rate": 7.237993596584846e-06,
"loss": 0.73530726,
"memory(GiB)": 71.94,
"step": 1440,
"train_speed(iter/s)": 0.020161
},
{
"epoch": 0.5982198302628855,
"grad_norm": 2.0425311715534513,
"learning_rate": 7.227321237993598e-06,
"loss": 0.78409719,
"memory(GiB)": 71.94,
"step": 1445,
"train_speed(iter/s)": 0.020163
},
{
"epoch": 0.6002897950734838,
"grad_norm": 1.725457162133973,
"learning_rate": 7.216648879402348e-06,
"loss": 0.74391842,
"memory(GiB)": 71.94,
"step": 1450,
"train_speed(iter/s)": 0.020163
},
{
"epoch": 0.602359759884082,
"grad_norm": 2.2362927243629613,
"learning_rate": 7.2059765208111e-06,
"loss": 0.77035971,
"memory(GiB)": 71.94,
"step": 1455,
"train_speed(iter/s)": 0.020166
},
{
"epoch": 0.6044297246946801,
"grad_norm": 2.379202645179455,
"learning_rate": 7.195304162219852e-06,
"loss": 0.74266062,
"memory(GiB)": 71.94,
"step": 1460,
"train_speed(iter/s)": 0.020168
},
{
"epoch": 0.6064996895052784,
"grad_norm": 1.6006607749389805,
"learning_rate": 7.184631803628602e-06,
"loss": 0.76957574,
"memory(GiB)": 71.94,
"step": 1465,
"train_speed(iter/s)": 0.020169
},
{
"epoch": 0.6085696543158766,
"grad_norm": 1.7633012594109296,
"learning_rate": 7.173959445037354e-06,
"loss": 0.77078071,
"memory(GiB)": 71.94,
"step": 1470,
"train_speed(iter/s)": 0.02017
},
{
"epoch": 0.6106396191264748,
"grad_norm": 1.6009632285824897,
"learning_rate": 7.163287086446106e-06,
"loss": 0.7669549,
"memory(GiB)": 71.94,
"step": 1475,
"train_speed(iter/s)": 0.020171
},
{
"epoch": 0.6127095839370731,
"grad_norm": 1.932344117154099,
"learning_rate": 7.152614727854857e-06,
"loss": 0.76528344,
"memory(GiB)": 71.94,
"step": 1480,
"train_speed(iter/s)": 0.020172
},
{
"epoch": 0.6147795487476713,
"grad_norm": 2.02896587820159,
"learning_rate": 7.141942369263608e-06,
"loss": 0.75168095,
"memory(GiB)": 71.94,
"step": 1485,
"train_speed(iter/s)": 0.020174
},
{
"epoch": 0.6168495135582696,
"grad_norm": 1.9974467066335662,
"learning_rate": 7.131270010672359e-06,
"loss": 0.7488194,
"memory(GiB)": 71.94,
"step": 1490,
"train_speed(iter/s)": 0.020176
},
{
"epoch": 0.6189194783688677,
"grad_norm": 1.7902382164373858,
"learning_rate": 7.120597652081111e-06,
"loss": 0.75267982,
"memory(GiB)": 71.94,
"step": 1495,
"train_speed(iter/s)": 0.020177
},
{
"epoch": 0.6209894431794659,
"grad_norm": 2.5929739472863838,
"learning_rate": 7.1099252934898625e-06,
"loss": 0.73765378,
"memory(GiB)": 71.94,
"step": 1500,
"train_speed(iter/s)": 0.020178
},
{
"epoch": 0.6209894431794659,
"eval_loss": 0.8280953168869019,
"eval_runtime": 333.1777,
"eval_samples_per_second": 18.747,
"eval_steps_per_second": 1.174,
"step": 1500
},
{
"epoch": 0.6230594079900642,
"grad_norm": 1.9847247838289337,
"learning_rate": 7.099252934898613e-06,
"loss": 0.75908709,
"memory(GiB)": 71.94,
"step": 1505,
"train_speed(iter/s)": 0.020077
},
{
"epoch": 0.6251293728006624,
"grad_norm": 1.801176606187135,
"learning_rate": 7.088580576307365e-06,
"loss": 0.78817225,
"memory(GiB)": 71.94,
"step": 1510,
"train_speed(iter/s)": 0.020079
},
{
"epoch": 0.6271993376112606,
"grad_norm": 1.8685868923530962,
"learning_rate": 7.0779082177161165e-06,
"loss": 0.74608173,
"memory(GiB)": 71.94,
"step": 1515,
"train_speed(iter/s)": 0.02008
},
{
"epoch": 0.6292693024218589,
"grad_norm": 1.9112331285520685,
"learning_rate": 7.067235859124867e-06,
"loss": 0.78047667,
"memory(GiB)": 71.94,
"step": 1520,
"train_speed(iter/s)": 0.020081
},
{
"epoch": 0.631339267232457,
"grad_norm": 1.69066034933461,
"learning_rate": 7.0565635005336185e-06,
"loss": 0.75149813,
"memory(GiB)": 71.94,
"step": 1525,
"train_speed(iter/s)": 0.02008
},
{
"epoch": 0.6334092320430552,
"grad_norm": 1.5394308738884603,
"learning_rate": 7.0458911419423704e-06,
"loss": 0.76838903,
"memory(GiB)": 71.94,
"step": 1530,
"train_speed(iter/s)": 0.020081
},
{
"epoch": 0.6354791968536535,
"grad_norm": 1.7690454856119193,
"learning_rate": 7.035218783351121e-06,
"loss": 0.7524828,
"memory(GiB)": 71.94,
"step": 1535,
"train_speed(iter/s)": 0.020083
},
{
"epoch": 0.6375491616642517,
"grad_norm": 2.0548651954814154,
"learning_rate": 7.0245464247598725e-06,
"loss": 0.76242485,
"memory(GiB)": 71.94,
"step": 1540,
"train_speed(iter/s)": 0.020083
},
{
"epoch": 0.63961912647485,
"grad_norm": 1.5706142249866388,
"learning_rate": 7.0138740661686235e-06,
"loss": 0.78198986,
"memory(GiB)": 71.94,
"step": 1545,
"train_speed(iter/s)": 0.020084
},
{
"epoch": 0.6416890912854482,
"grad_norm": 2.9215645341915275,
"learning_rate": 7.0032017075773754e-06,
"loss": 0.74943571,
"memory(GiB)": 71.94,
"step": 1550,
"train_speed(iter/s)": 0.020086
},
{
"epoch": 0.6437590560960463,
"grad_norm": 1.983762828992232,
"learning_rate": 6.9925293489861265e-06,
"loss": 0.75862083,
"memory(GiB)": 71.94,
"step": 1555,
"train_speed(iter/s)": 0.020087
},
{
"epoch": 0.6458290209066446,
"grad_norm": 1.8256568832087245,
"learning_rate": 6.9818569903948775e-06,
"loss": 0.75860863,
"memory(GiB)": 71.94,
"step": 1560,
"train_speed(iter/s)": 0.020089
},
{
"epoch": 0.6478989857172428,
"grad_norm": 1.7082999758601491,
"learning_rate": 6.971184631803629e-06,
"loss": 0.78126869,
"memory(GiB)": 71.94,
"step": 1565,
"train_speed(iter/s)": 0.02009
},
{
"epoch": 0.649968950527841,
"grad_norm": 2.388449730753909,
"learning_rate": 6.960512273212381e-06,
"loss": 0.76754818,
"memory(GiB)": 71.94,
"step": 1570,
"train_speed(iter/s)": 0.020091
},
{
"epoch": 0.6520389153384393,
"grad_norm": 2.570709108294184,
"learning_rate": 6.9498399146211315e-06,
"loss": 0.74245424,
"memory(GiB)": 71.94,
"step": 1575,
"train_speed(iter/s)": 0.020093
},
{
"epoch": 0.6541088801490375,
"grad_norm": 1.942646301485773,
"learning_rate": 6.939167556029883e-06,
"loss": 0.77264175,
"memory(GiB)": 71.94,
"step": 1580,
"train_speed(iter/s)": 0.020094
},
{
"epoch": 0.6561788449596356,
"grad_norm": 1.7902561992868253,
"learning_rate": 6.928495197438635e-06,
"loss": 0.76845627,
"memory(GiB)": 71.94,
"step": 1585,
"train_speed(iter/s)": 0.020096
},
{
"epoch": 0.6582488097702339,
"grad_norm": 1.8253085296468832,
"learning_rate": 6.9178228388473854e-06,
"loss": 0.75771255,
"memory(GiB)": 71.94,
"step": 1590,
"train_speed(iter/s)": 0.020097
},
{
"epoch": 0.6603187745808321,
"grad_norm": 1.8440321320283706,
"learning_rate": 6.907150480256137e-06,
"loss": 0.74345121,
"memory(GiB)": 71.94,
"step": 1595,
"train_speed(iter/s)": 0.020096
},
{
"epoch": 0.6623887393914304,
"grad_norm": 1.8894245153228149,
"learning_rate": 6.896478121664889e-06,
"loss": 0.76188393,
"memory(GiB)": 71.94,
"step": 1600,
"train_speed(iter/s)": 0.020098
},
{
"epoch": 0.6644587042020286,
"grad_norm": 2.055312059883184,
"learning_rate": 6.885805763073639e-06,
"loss": 0.75247483,
"memory(GiB)": 71.94,
"step": 1605,
"train_speed(iter/s)": 0.020099
},
{
"epoch": 0.6665286690126268,
"grad_norm": 1.8131130404445874,
"learning_rate": 6.875133404482391e-06,
"loss": 0.77208357,
"memory(GiB)": 71.94,
"step": 1610,
"train_speed(iter/s)": 0.0201
},
{
"epoch": 0.668598633823225,
"grad_norm": 1.721876122278255,
"learning_rate": 6.864461045891142e-06,
"loss": 0.7271523,
"memory(GiB)": 71.94,
"step": 1615,
"train_speed(iter/s)": 0.020101
},
{
"epoch": 0.6706685986338232,
"grad_norm": 1.880195637198508,
"learning_rate": 6.853788687299893e-06,
"loss": 0.74756432,
"memory(GiB)": 71.94,
"step": 1620,
"train_speed(iter/s)": 0.020101
},
{
"epoch": 0.6727385634444214,
"grad_norm": 1.5895741550527986,
"learning_rate": 6.843116328708645e-06,
"loss": 0.77960148,
"memory(GiB)": 71.94,
"step": 1625,
"train_speed(iter/s)": 0.020102
},
{
"epoch": 0.6748085282550197,
"grad_norm": 1.762407790878705,
"learning_rate": 6.832443970117396e-06,
"loss": 0.76564293,
"memory(GiB)": 71.94,
"step": 1630,
"train_speed(iter/s)": 0.020104
},
{
"epoch": 0.6768784930656179,
"grad_norm": 1.85152265865232,
"learning_rate": 6.821771611526148e-06,
"loss": 0.75755472,
"memory(GiB)": 71.94,
"step": 1635,
"train_speed(iter/s)": 0.020104
},
{
"epoch": 0.678948457876216,
"grad_norm": 2.4048195292609464,
"learning_rate": 6.811099252934899e-06,
"loss": 0.75722828,
"memory(GiB)": 71.94,
"step": 1640,
"train_speed(iter/s)": 0.020106
},
{
"epoch": 0.6810184226868143,
"grad_norm": 1.7461290969223273,
"learning_rate": 6.80042689434365e-06,
"loss": 0.74719772,
"memory(GiB)": 71.94,
"step": 1645,
"train_speed(iter/s)": 0.020107
},
{
"epoch": 0.6830883874974125,
"grad_norm": 1.7606017366047548,
"learning_rate": 6.789754535752402e-06,
"loss": 0.74246426,
"memory(GiB)": 71.94,
"step": 1650,
"train_speed(iter/s)": 0.020108
},
{
"epoch": 0.6851583523080108,
"grad_norm": 2.3484261163252884,
"learning_rate": 6.779082177161154e-06,
"loss": 0.7567915,
"memory(GiB)": 71.94,
"step": 1655,
"train_speed(iter/s)": 0.020109
},
{
"epoch": 0.687228317118609,
"grad_norm": 1.686698632081635,
"learning_rate": 6.768409818569904e-06,
"loss": 0.73500414,
"memory(GiB)": 71.94,
"step": 1660,
"train_speed(iter/s)": 0.020111
},
{
"epoch": 0.6892982819292072,
"grad_norm": 1.9785908023609375,
"learning_rate": 6.757737459978656e-06,
"loss": 0.7035881,
"memory(GiB)": 71.94,
"step": 1665,
"train_speed(iter/s)": 0.020112
},
{
"epoch": 0.6913682467398055,
"grad_norm": 1.8288827641332985,
"learning_rate": 6.747065101387406e-06,
"loss": 0.74135156,
"memory(GiB)": 71.94,
"step": 1670,
"train_speed(iter/s)": 0.020113
},
{
"epoch": 0.6934382115504036,
"grad_norm": 2.106219884662748,
"learning_rate": 6.736392742796158e-06,
"loss": 0.77649341,
"memory(GiB)": 71.94,
"step": 1675,
"train_speed(iter/s)": 0.020115
},
{
"epoch": 0.6955081763610018,
"grad_norm": 1.857981089347382,
"learning_rate": 6.72572038420491e-06,
"loss": 0.73271251,
"memory(GiB)": 71.94,
"step": 1680,
"train_speed(iter/s)": 0.020116
},
{
"epoch": 0.6975781411716001,
"grad_norm": 1.8252469751324223,
"learning_rate": 6.715048025613661e-06,
"loss": 0.76072979,
"memory(GiB)": 71.94,
"step": 1685,
"train_speed(iter/s)": 0.020117
},
{
"epoch": 0.6996481059821983,
"grad_norm": 1.9875787155351985,
"learning_rate": 6.704375667022412e-06,
"loss": 0.73438239,
"memory(GiB)": 71.94,
"step": 1690,
"train_speed(iter/s)": 0.020118
},
{
"epoch": 0.7017180707927966,
"grad_norm": 1.685302389303672,
"learning_rate": 6.693703308431164e-06,
"loss": 0.76429882,
"memory(GiB)": 71.94,
"step": 1695,
"train_speed(iter/s)": 0.020119
},
{
"epoch": 0.7037880356033948,
"grad_norm": 2.30374573526697,
"learning_rate": 6.683030949839915e-06,
"loss": 0.7437336,
"memory(GiB)": 71.94,
"step": 1700,
"train_speed(iter/s)": 0.020119
},
{
"epoch": 0.7058580004139929,
"grad_norm": 2.4577356463267104,
"learning_rate": 6.672358591248667e-06,
"loss": 0.79406719,
"memory(GiB)": 71.94,
"step": 1705,
"train_speed(iter/s)": 0.02012
},
{
"epoch": 0.7079279652245912,
"grad_norm": 1.7489028068953179,
"learning_rate": 6.661686232657418e-06,
"loss": 0.75482893,
"memory(GiB)": 71.94,
"step": 1710,
"train_speed(iter/s)": 0.020121
},
{
"epoch": 0.7099979300351894,
"grad_norm": 1.929474801980816,
"learning_rate": 6.651013874066169e-06,
"loss": 0.74136767,
"memory(GiB)": 71.94,
"step": 1715,
"train_speed(iter/s)": 0.020122
},
{
"epoch": 0.7120678948457876,
"grad_norm": 1.6790168198096502,
"learning_rate": 6.640341515474921e-06,
"loss": 0.72392588,
"memory(GiB)": 71.94,
"step": 1720,
"train_speed(iter/s)": 0.020122
},
{
"epoch": 0.7141378596563859,
"grad_norm": 2.2963610149429488,
"learning_rate": 6.629669156883671e-06,
"loss": 0.7462635,
"memory(GiB)": 71.94,
"step": 1725,
"train_speed(iter/s)": 0.020123
},
{
"epoch": 0.716207824466984,
"grad_norm": 4.289784718847475,
"learning_rate": 6.618996798292423e-06,
"loss": 0.73541126,
"memory(GiB)": 71.94,
"step": 1730,
"train_speed(iter/s)": 0.020124
},
{
"epoch": 0.7182777892775822,
"grad_norm": 2.1972884462976263,
"learning_rate": 6.608324439701175e-06,
"loss": 0.7353497,
"memory(GiB)": 71.94,
"step": 1735,
"train_speed(iter/s)": 0.020124
},
{
"epoch": 0.7203477540881805,
"grad_norm": 2.1738189409828377,
"learning_rate": 6.597652081109925e-06,
"loss": 0.71738148,
"memory(GiB)": 71.94,
"step": 1740,
"train_speed(iter/s)": 0.020126
},
{
"epoch": 0.7224177188987787,
"grad_norm": 1.6342074890059992,
"learning_rate": 6.586979722518677e-06,
"loss": 0.75047336,
"memory(GiB)": 71.94,
"step": 1745,
"train_speed(iter/s)": 0.020126
},
{
"epoch": 0.724487683709377,
"grad_norm": 1.7007570391919413,
"learning_rate": 6.576307363927429e-06,
"loss": 0.73253298,
"memory(GiB)": 71.94,
"step": 1750,
"train_speed(iter/s)": 0.020127
},
{
"epoch": 0.7265576485199752,
"grad_norm": 1.5323053950217638,
"learning_rate": 6.56563500533618e-06,
"loss": 0.74062099,
"memory(GiB)": 71.94,
"step": 1755,
"train_speed(iter/s)": 0.020128
},
{
"epoch": 0.7286276133305734,
"grad_norm": 1.9624071404199714,
"learning_rate": 6.554962646744931e-06,
"loss": 0.76860294,
"memory(GiB)": 71.94,
"step": 1760,
"train_speed(iter/s)": 0.02013
},
{
"epoch": 0.7306975781411716,
"grad_norm": 1.8145041855689747,
"learning_rate": 6.544290288153683e-06,
"loss": 0.72403975,
"memory(GiB)": 71.94,
"step": 1765,
"train_speed(iter/s)": 0.020131
},
{
"epoch": 0.7327675429517698,
"grad_norm": 1.7793196071264126,
"learning_rate": 6.533617929562434e-06,
"loss": 0.76407566,
"memory(GiB)": 71.94,
"step": 1770,
"train_speed(iter/s)": 0.020132
},
{
"epoch": 0.734837507762368,
"grad_norm": 1.8806974947113853,
"learning_rate": 6.522945570971186e-06,
"loss": 0.73674612,
"memory(GiB)": 71.94,
"step": 1775,
"train_speed(iter/s)": 0.020133
},
{
"epoch": 0.7369074725729663,
"grad_norm": 1.6203999356727887,
"learning_rate": 6.512273212379937e-06,
"loss": 0.73720975,
"memory(GiB)": 71.94,
"step": 1780,
"train_speed(iter/s)": 0.020135
},
{
"epoch": 0.7389774373835645,
"grad_norm": 1.8256501665131275,
"learning_rate": 6.501600853788688e-06,
"loss": 0.74560528,
"memory(GiB)": 71.94,
"step": 1785,
"train_speed(iter/s)": 0.020136
},
{
"epoch": 0.7410474021941627,
"grad_norm": 2.3313828860511294,
"learning_rate": 6.49092849519744e-06,
"loss": 0.73726311,
"memory(GiB)": 71.94,
"step": 1790,
"train_speed(iter/s)": 0.020137
},
{
"epoch": 0.7431173670047609,
"grad_norm": 2.01967250245603,
"learning_rate": 6.48025613660619e-06,
"loss": 0.72599983,
"memory(GiB)": 71.94,
"step": 1795,
"train_speed(iter/s)": 0.020138
},
{
"epoch": 0.7451873318153591,
"grad_norm": 1.5873931113082191,
"learning_rate": 6.469583778014942e-06,
"loss": 0.72361288,
"memory(GiB)": 71.94,
"step": 1800,
"train_speed(iter/s)": 0.020139
},
{
"epoch": 0.7451873318153591,
"eval_loss": 0.821691632270813,
"eval_runtime": 333.5584,
"eval_samples_per_second": 18.725,
"eval_steps_per_second": 1.172,
"step": 1800
},
{
"epoch": 0.7472572966259574,
"grad_norm": 1.6284031081935817,
"learning_rate": 6.458911419423694e-06,
"loss": 0.73620996,
"memory(GiB)": 71.94,
"step": 1805,
"train_speed(iter/s)": 0.020055
},
{
"epoch": 0.7493272614365556,
"grad_norm": 1.8143587680892548,
"learning_rate": 6.448239060832444e-06,
"loss": 0.73436375,
"memory(GiB)": 71.94,
"step": 1810,
"train_speed(iter/s)": 0.020056
},
{
"epoch": 0.7513972262471538,
"grad_norm": 1.9502300953167138,
"learning_rate": 6.437566702241196e-06,
"loss": 0.74698448,
"memory(GiB)": 71.94,
"step": 1815,
"train_speed(iter/s)": 0.020057
},
{
"epoch": 0.7534671910577521,
"grad_norm": 1.72820315761644,
"learning_rate": 6.426894343649948e-06,
"loss": 0.75661831,
"memory(GiB)": 71.94,
"step": 1820,
"train_speed(iter/s)": 0.020059
},
{
"epoch": 0.7555371558683502,
"grad_norm": 1.8039931329917855,
"learning_rate": 6.416221985058698e-06,
"loss": 0.74954405,
"memory(GiB)": 71.94,
"step": 1825,
"train_speed(iter/s)": 0.020059
},
{
"epoch": 0.7576071206789484,
"grad_norm": 2.0711566925028433,
"learning_rate": 6.40554962646745e-06,
"loss": 0.73745756,
"memory(GiB)": 71.94,
"step": 1830,
"train_speed(iter/s)": 0.020061
},
{
"epoch": 0.7596770854895467,
"grad_norm": 1.826394447351557,
"learning_rate": 6.3948772678762016e-06,
"loss": 0.73249888,
"memory(GiB)": 71.94,
"step": 1835,
"train_speed(iter/s)": 0.020062
},
{
"epoch": 0.7617470503001449,
"grad_norm": 2.4929262063136175,
"learning_rate": 6.384204909284953e-06,
"loss": 0.72507439,
"memory(GiB)": 71.94,
"step": 1840,
"train_speed(iter/s)": 0.020062
},
{
"epoch": 0.7638170151107431,
"grad_norm": 1.7606115982834467,
"learning_rate": 6.373532550693704e-06,
"loss": 0.72618227,
"memory(GiB)": 71.94,
"step": 1845,
"train_speed(iter/s)": 0.020064
},
{
"epoch": 0.7658869799213414,
"grad_norm": 1.7780633999979434,
"learning_rate": 6.362860192102455e-06,
"loss": 0.73577065,
"memory(GiB)": 71.94,
"step": 1850,
"train_speed(iter/s)": 0.020064
},
{
"epoch": 0.7679569447319395,
"grad_norm": 2.2901271183050294,
"learning_rate": 6.3521878335112066e-06,
"loss": 0.75972033,
"memory(GiB)": 71.94,
"step": 1855,
"train_speed(iter/s)": 0.020065
},
{
"epoch": 0.7700269095425378,
"grad_norm": 1.974709118996213,
"learning_rate": 6.3415154749199585e-06,
"loss": 0.7513834,
"memory(GiB)": 71.94,
"step": 1860,
"train_speed(iter/s)": 0.020065
},
{
"epoch": 0.772096874353136,
"grad_norm": 1.7795619053561953,
"learning_rate": 6.330843116328709e-06,
"loss": 0.70549402,
"memory(GiB)": 71.94,
"step": 1865,
"train_speed(iter/s)": 0.020066
},
{
"epoch": 0.7741668391637342,
"grad_norm": 1.7925210555170064,
"learning_rate": 6.3201707577374605e-06,
"loss": 0.75538568,
"memory(GiB)": 71.94,
"step": 1870,
"train_speed(iter/s)": 0.020067
},
{
"epoch": 0.7762368039743325,
"grad_norm": 1.7592519440523378,
"learning_rate": 6.309498399146212e-06,
"loss": 0.74328322,
"memory(GiB)": 71.94,
"step": 1875,
"train_speed(iter/s)": 0.020068
},
{
"epoch": 0.7783067687849307,
"grad_norm": 1.7630568722667896,
"learning_rate": 6.298826040554963e-06,
"loss": 0.73995676,
"memory(GiB)": 71.94,
"step": 1880,
"train_speed(iter/s)": 0.020069
},
{
"epoch": 0.7803767335955288,
"grad_norm": 1.732047211183728,
"learning_rate": 6.2881536819637145e-06,
"loss": 0.70023623,
"memory(GiB)": 71.94,
"step": 1885,
"train_speed(iter/s)": 0.02007
},
{
"epoch": 0.7824466984061271,
"grad_norm": 1.7004814074017778,
"learning_rate": 6.277481323372466e-06,
"loss": 0.73256617,
"memory(GiB)": 71.94,
"step": 1890,
"train_speed(iter/s)": 0.020071
},
{
"epoch": 0.7845166632167253,
"grad_norm": 1.7170761577962488,
"learning_rate": 6.2668089647812166e-06,
"loss": 0.71450982,
"memory(GiB)": 71.94,
"step": 1895,
"train_speed(iter/s)": 0.020071
},
{
"epoch": 0.7865866280273235,
"grad_norm": 2.0086204171681565,
"learning_rate": 6.2561366061899685e-06,
"loss": 0.7096673,
"memory(GiB)": 71.94,
"step": 1900,
"train_speed(iter/s)": 0.020072
},
{
"epoch": 0.7886565928379218,
"grad_norm": 1.6109075949007228,
"learning_rate": 6.2454642475987195e-06,
"loss": 0.7170536,
"memory(GiB)": 71.94,
"step": 1905,
"train_speed(iter/s)": 0.020073
},
{
"epoch": 0.79072655764852,
"grad_norm": 1.6468982825229455,
"learning_rate": 6.234791889007471e-06,
"loss": 0.7817646,
"memory(GiB)": 71.94,
"step": 1910,
"train_speed(iter/s)": 0.020073
},
{
"epoch": 0.7927965224591182,
"grad_norm": 1.8405361482523723,
"learning_rate": 6.224119530416222e-06,
"loss": 0.71389322,
"memory(GiB)": 71.94,
"step": 1915,
"train_speed(iter/s)": 0.020075
},
{
"epoch": 0.7948664872697164,
"grad_norm": 1.7937559729338877,
"learning_rate": 6.2134471718249735e-06,
"loss": 0.72494421,
"memory(GiB)": 71.94,
"step": 1920,
"train_speed(iter/s)": 0.020076
},
{
"epoch": 0.7969364520803146,
"grad_norm": 1.98762799360225,
"learning_rate": 6.202774813233725e-06,
"loss": 0.75637407,
"memory(GiB)": 71.94,
"step": 1925,
"train_speed(iter/s)": 0.020077
},
{
"epoch": 0.7990064168909129,
"grad_norm": 2.469167716565665,
"learning_rate": 6.192102454642477e-06,
"loss": 0.71725979,
"memory(GiB)": 71.94,
"step": 1930,
"train_speed(iter/s)": 0.020078
},
{
"epoch": 0.8010763817015111,
"grad_norm": 1.6526117746871118,
"learning_rate": 6.181430096051227e-06,
"loss": 0.73172369,
"memory(GiB)": 71.94,
"step": 1935,
"train_speed(iter/s)": 0.020079
},
{
"epoch": 0.8031463465121093,
"grad_norm": 1.8881085526929478,
"learning_rate": 6.170757737459979e-06,
"loss": 0.68869176,
"memory(GiB)": 71.94,
"step": 1940,
"train_speed(iter/s)": 0.02008
},
{
"epoch": 0.8052163113227075,
"grad_norm": 2.1341112552467107,
"learning_rate": 6.160085378868731e-06,
"loss": 0.74425526,
"memory(GiB)": 71.94,
"step": 1945,
"train_speed(iter/s)": 0.020081
},
{
"epoch": 0.8072862761333057,
"grad_norm": 2.1587279161379906,
"learning_rate": 6.149413020277481e-06,
"loss": 0.72686901,
"memory(GiB)": 71.94,
"step": 1950,
"train_speed(iter/s)": 0.020081
},
{
"epoch": 0.8093562409439039,
"grad_norm": 1.7183166805211196,
"learning_rate": 6.138740661686233e-06,
"loss": 0.70801015,
"memory(GiB)": 71.94,
"step": 1955,
"train_speed(iter/s)": 0.020082
},
{
"epoch": 0.8114262057545022,
"grad_norm": 1.6918548000232365,
"learning_rate": 6.128068303094985e-06,
"loss": 0.71407743,
"memory(GiB)": 71.94,
"step": 1960,
"train_speed(iter/s)": 0.020083
},
{
"epoch": 0.8134961705651004,
"grad_norm": 1.6783017839137153,
"learning_rate": 6.117395944503735e-06,
"loss": 0.74968853,
"memory(GiB)": 71.94,
"step": 1965,
"train_speed(iter/s)": 0.020083
},
{
"epoch": 0.8155661353756987,
"grad_norm": 1.7117864547494115,
"learning_rate": 6.106723585912487e-06,
"loss": 0.71134882,
"memory(GiB)": 71.94,
"step": 1970,
"train_speed(iter/s)": 0.020085
},
{
"epoch": 0.8176361001862968,
"grad_norm": 1.9043254368072582,
"learning_rate": 6.096051227321238e-06,
"loss": 0.74376335,
"memory(GiB)": 71.94,
"step": 1975,
"train_speed(iter/s)": 0.020085
},
{
"epoch": 0.819706064996895,
"grad_norm": 1.8416766338299921,
"learning_rate": 6.08537886872999e-06,
"loss": 0.74048576,
"memory(GiB)": 71.94,
"step": 1980,
"train_speed(iter/s)": 0.020086
},
{
"epoch": 0.8217760298074933,
"grad_norm": 1.9993092375152783,
"learning_rate": 6.074706510138741e-06,
"loss": 0.73070145,
"memory(GiB)": 71.94,
"step": 1985,
"train_speed(iter/s)": 0.020087
},
{
"epoch": 0.8238459946180915,
"grad_norm": 1.9255287807426156,
"learning_rate": 6.064034151547492e-06,
"loss": 0.71732969,
"memory(GiB)": 71.94,
"step": 1990,
"train_speed(iter/s)": 0.020088
},
{
"epoch": 0.8259159594286897,
"grad_norm": 1.699758287154301,
"learning_rate": 6.053361792956244e-06,
"loss": 0.70977745,
"memory(GiB)": 71.94,
"step": 1995,
"train_speed(iter/s)": 0.020089
},
{
"epoch": 0.827985924239288,
"grad_norm": 1.7447876663920783,
"learning_rate": 6.042689434364995e-06,
"loss": 0.72701621,
"memory(GiB)": 71.94,
"step": 2000,
"train_speed(iter/s)": 0.02009
},
{
"epoch": 0.8300558890498861,
"grad_norm": 1.8875191736470693,
"learning_rate": 6.032017075773746e-06,
"loss": 0.72644696,
"memory(GiB)": 71.94,
"step": 2005,
"train_speed(iter/s)": 0.02009
},
{
"epoch": 0.8321258538604843,
"grad_norm": 1.6472530019204896,
"learning_rate": 6.021344717182498e-06,
"loss": 0.70005097,
"memory(GiB)": 71.94,
"step": 2010,
"train_speed(iter/s)": 0.020091
},
{
"epoch": 0.8341958186710826,
"grad_norm": 2.2818513681921266,
"learning_rate": 6.01067235859125e-06,
"loss": 0.72365198,
"memory(GiB)": 71.94,
"step": 2015,
"train_speed(iter/s)": 0.020092
},
{
"epoch": 0.8362657834816808,
"grad_norm": 1.8205665836409684,
"learning_rate": 6e-06,
"loss": 0.7322978,
"memory(GiB)": 71.94,
"step": 2020,
"train_speed(iter/s)": 0.020092
},
{
"epoch": 0.8383357482922791,
"grad_norm": 1.980690292432822,
"learning_rate": 5.989327641408752e-06,
"loss": 0.73382425,
"memory(GiB)": 71.94,
"step": 2025,
"train_speed(iter/s)": 0.020093
},
{
"epoch": 0.8404057131028773,
"grad_norm": 1.7748584169287815,
"learning_rate": 5.978655282817502e-06,
"loss": 0.7543438,
"memory(GiB)": 71.94,
"step": 2030,
"train_speed(iter/s)": 0.020094
},
{
"epoch": 0.8424756779134754,
"grad_norm": 1.9477910790390784,
"learning_rate": 5.967982924226254e-06,
"loss": 0.73893185,
"memory(GiB)": 71.94,
"step": 2035,
"train_speed(iter/s)": 0.020095
},
{
"epoch": 0.8445456427240737,
"grad_norm": 1.5695526526206898,
"learning_rate": 5.957310565635006e-06,
"loss": 0.71403108,
"memory(GiB)": 71.94,
"step": 2040,
"train_speed(iter/s)": 0.020096
},
{
"epoch": 0.8466156075346719,
"grad_norm": 2.1517602299856557,
"learning_rate": 5.946638207043757e-06,
"loss": 0.71713848,
"memory(GiB)": 71.94,
"step": 2045,
"train_speed(iter/s)": 0.020097
},
{
"epoch": 0.8486855723452701,
"grad_norm": 2.739525728221928,
"learning_rate": 5.935965848452508e-06,
"loss": 0.72253714,
"memory(GiB)": 71.94,
"step": 2050,
"train_speed(iter/s)": 0.020099
},
{
"epoch": 0.8507555371558684,
"grad_norm": 1.9454548994868823,
"learning_rate": 5.92529348986126e-06,
"loss": 0.74796634,
"memory(GiB)": 71.94,
"step": 2055,
"train_speed(iter/s)": 0.0201
},
{
"epoch": 0.8528255019664666,
"grad_norm": 1.7996528216814918,
"learning_rate": 5.914621131270011e-06,
"loss": 0.71262107,
"memory(GiB)": 71.94,
"step": 2060,
"train_speed(iter/s)": 0.020101
},
{
"epoch": 0.8548954667770647,
"grad_norm": 1.9001952709840753,
"learning_rate": 5.903948772678763e-06,
"loss": 0.71814876,
"memory(GiB)": 71.94,
"step": 2065,
"train_speed(iter/s)": 0.020101
},
{
"epoch": 0.856965431587663,
"grad_norm": 3.4631327121051036,
"learning_rate": 5.893276414087514e-06,
"loss": 0.70359154,
"memory(GiB)": 71.94,
"step": 2070,
"train_speed(iter/s)": 0.020103
},
{
"epoch": 0.8590353963982612,
"grad_norm": 1.9078035163840932,
"learning_rate": 5.882604055496265e-06,
"loss": 0.74100571,
"memory(GiB)": 71.94,
"step": 2075,
"train_speed(iter/s)": 0.020103
},
{
"epoch": 0.8611053612088595,
"grad_norm": 2.7698267455997576,
"learning_rate": 5.871931696905017e-06,
"loss": 0.71972656,
"memory(GiB)": 71.94,
"step": 2080,
"train_speed(iter/s)": 0.020104
},
{
"epoch": 0.8631753260194577,
"grad_norm": 1.9640858230267009,
"learning_rate": 5.861259338313769e-06,
"loss": 0.71868258,
"memory(GiB)": 71.94,
"step": 2085,
"train_speed(iter/s)": 0.020105
},
{
"epoch": 0.8652452908300559,
"grad_norm": 1.9593324236104832,
"learning_rate": 5.850586979722519e-06,
"loss": 0.70707326,
"memory(GiB)": 71.94,
"step": 2090,
"train_speed(iter/s)": 0.020106
},
{
"epoch": 0.8673152556406541,
"grad_norm": 2.0337621679872946,
"learning_rate": 5.839914621131271e-06,
"loss": 0.7303226,
"memory(GiB)": 71.94,
"step": 2095,
"train_speed(iter/s)": 0.020107
},
{
"epoch": 0.8693852204512523,
"grad_norm": 1.7464491411508778,
"learning_rate": 5.829242262540021e-06,
"loss": 0.70045071,
"memory(GiB)": 71.94,
"step": 2100,
"train_speed(iter/s)": 0.020108
},
{
"epoch": 0.8693852204512523,
"eval_loss": 0.8155556321144104,
"eval_runtime": 334.2741,
"eval_samples_per_second": 18.685,
"eval_steps_per_second": 1.17,
"step": 2100
},
{
"epoch": 0.8714551852618505,
"grad_norm": 1.893698412703024,
"learning_rate": 5.818569903948773e-06,
"loss": 0.71434135,
"memory(GiB)": 71.94,
"step": 2105,
"train_speed(iter/s)": 0.020036
},
{
"epoch": 0.8735251500724488,
"grad_norm": 1.8915863471870793,
"learning_rate": 5.807897545357525e-06,
"loss": 0.73264565,
"memory(GiB)": 71.94,
"step": 2110,
"train_speed(iter/s)": 0.020037
},
{
"epoch": 0.875595114883047,
"grad_norm": 1.5909748405215243,
"learning_rate": 5.797225186766276e-06,
"loss": 0.6809236,
"memory(GiB)": 71.94,
"step": 2115,
"train_speed(iter/s)": 0.020037
},
{
"epoch": 0.8776650796936452,
"grad_norm": 2.298029244846505,
"learning_rate": 5.786552828175027e-06,
"loss": 0.72660675,
"memory(GiB)": 71.94,
"step": 2120,
"train_speed(iter/s)": 0.020038
},
{
"epoch": 0.8797350445042434,
"grad_norm": 2.4471148736933634,
"learning_rate": 5.775880469583779e-06,
"loss": 0.71458502,
"memory(GiB)": 71.94,
"step": 2125,
"train_speed(iter/s)": 0.020039
},
{
"epoch": 0.8818050093148416,
"grad_norm": 2.237641446928337,
"learning_rate": 5.76520811099253e-06,
"loss": 0.72527971,
"memory(GiB)": 71.94,
"step": 2130,
"train_speed(iter/s)": 0.02004
},
{
"epoch": 0.8838749741254399,
"grad_norm": 2.1373854033173667,
"learning_rate": 5.754535752401282e-06,
"loss": 0.68969226,
"memory(GiB)": 71.94,
"step": 2135,
"train_speed(iter/s)": 0.020041
},
{
"epoch": 0.8859449389360381,
"grad_norm": 2.4814505236104254,
"learning_rate": 5.743863393810033e-06,
"loss": 0.72110472,
"memory(GiB)": 71.94,
"step": 2140,
"train_speed(iter/s)": 0.020042
},
{
"epoch": 0.8880149037466363,
"grad_norm": 1.8846949427239792,
"learning_rate": 5.733191035218784e-06,
"loss": 0.71526957,
"memory(GiB)": 71.94,
"step": 2145,
"train_speed(iter/s)": 0.020043
},
{
"epoch": 0.8900848685572346,
"grad_norm": 1.6414510600406789,
"learning_rate": 5.722518676627536e-06,
"loss": 0.72875662,
"memory(GiB)": 71.94,
"step": 2150,
"train_speed(iter/s)": 0.020043
},
{
"epoch": 0.8921548333678327,
"grad_norm": 2.027904279012944,
"learning_rate": 5.711846318036286e-06,
"loss": 0.73166924,
"memory(GiB)": 71.94,
"step": 2155,
"train_speed(iter/s)": 0.020044
},
{
"epoch": 0.8942247981784309,
"grad_norm": 2.0007554119149136,
"learning_rate": 5.701173959445038e-06,
"loss": 0.69900818,
"memory(GiB)": 71.94,
"step": 2160,
"train_speed(iter/s)": 0.020046
},
{
"epoch": 0.8962947629890292,
"grad_norm": 1.732929144774659,
"learning_rate": 5.69050160085379e-06,
"loss": 0.70091386,
"memory(GiB)": 71.94,
"step": 2165,
"train_speed(iter/s)": 0.020046
},
{
"epoch": 0.8983647277996274,
"grad_norm": 1.6575807865879337,
"learning_rate": 5.67982924226254e-06,
"loss": 0.70530014,
"memory(GiB)": 71.94,
"step": 2170,
"train_speed(iter/s)": 0.020047
},
{
"epoch": 0.9004346926102256,
"grad_norm": 1.8402505669080536,
"learning_rate": 5.669156883671292e-06,
"loss": 0.72022433,
"memory(GiB)": 71.94,
"step": 2175,
"train_speed(iter/s)": 0.020048
},
{
"epoch": 0.9025046574208239,
"grad_norm": 1.8122270786550385,
"learning_rate": 5.6584845250800435e-06,
"loss": 0.67802505,
"memory(GiB)": 71.94,
"step": 2180,
"train_speed(iter/s)": 0.020048
},
{
"epoch": 0.904574622231422,
"grad_norm": 1.7969445274298348,
"learning_rate": 5.647812166488794e-06,
"loss": 0.70636382,
"memory(GiB)": 71.94,
"step": 2185,
"train_speed(iter/s)": 0.020049
},
{
"epoch": 0.9066445870420203,
"grad_norm": 2.1142537074713412,
"learning_rate": 5.637139807897546e-06,
"loss": 0.71558409,
"memory(GiB)": 71.94,
"step": 2190,
"train_speed(iter/s)": 0.02005
},
{
"epoch": 0.9087145518526185,
"grad_norm": 1.944413383820558,
"learning_rate": 5.6264674493062975e-06,
"loss": 0.72004523,
"memory(GiB)": 71.94,
"step": 2195,
"train_speed(iter/s)": 0.020051
},
{
"epoch": 0.9107845166632167,
"grad_norm": 2.3051719970776934,
"learning_rate": 5.6157950907150485e-06,
"loss": 0.7308382,
"memory(GiB)": 71.94,
"step": 2200,
"train_speed(iter/s)": 0.020052
},
{
"epoch": 0.912854481473815,
"grad_norm": 2.27411033803044,
"learning_rate": 5.6051227321238e-06,
"loss": 0.74844613,
"memory(GiB)": 71.94,
"step": 2205,
"train_speed(iter/s)": 0.020053
},
{
"epoch": 0.9149244462844132,
"grad_norm": 1.8444250783225764,
"learning_rate": 5.594450373532551e-06,
"loss": 0.68941135,
"memory(GiB)": 71.94,
"step": 2210,
"train_speed(iter/s)": 0.020053
},
{
"epoch": 0.9169944110950113,
"grad_norm": 1.8662325411124825,
"learning_rate": 5.5837780149413025e-06,
"loss": 0.73066435,
"memory(GiB)": 71.94,
"step": 2215,
"train_speed(iter/s)": 0.020054
},
{
"epoch": 0.9190643759056096,
"grad_norm": 1.6833532844662813,
"learning_rate": 5.573105656350054e-06,
"loss": 0.7062602,
"memory(GiB)": 71.94,
"step": 2220,
"train_speed(iter/s)": 0.020055
},
{
"epoch": 0.9211343407162078,
"grad_norm": 1.6070808318678096,
"learning_rate": 5.562433297758805e-06,
"loss": 0.69585543,
"memory(GiB)": 71.94,
"step": 2225,
"train_speed(iter/s)": 0.020055
},
{
"epoch": 0.923204305526806,
"grad_norm": 2.0016548598313024,
"learning_rate": 5.5517609391675565e-06,
"loss": 0.7085475,
"memory(GiB)": 71.94,
"step": 2230,
"train_speed(iter/s)": 0.020056
},
{
"epoch": 0.9252742703374043,
"grad_norm": 1.9201243304059477,
"learning_rate": 5.541088580576308e-06,
"loss": 0.71516237,
"memory(GiB)": 71.94,
"step": 2235,
"train_speed(iter/s)": 0.020057
},
{
"epoch": 0.9273442351480025,
"grad_norm": 1.9055608045022892,
"learning_rate": 5.5304162219850586e-06,
"loss": 0.69740834,
"memory(GiB)": 71.94,
"step": 2240,
"train_speed(iter/s)": 0.020058
},
{
"epoch": 0.9294141999586008,
"grad_norm": 2.0041753291659026,
"learning_rate": 5.5197438633938104e-06,
"loss": 0.71469507,
"memory(GiB)": 71.94,
"step": 2245,
"train_speed(iter/s)": 0.02006
},
{
"epoch": 0.9314841647691989,
"grad_norm": 2.025723530711083,
"learning_rate": 5.509071504802562e-06,
"loss": 0.69406614,
"memory(GiB)": 71.94,
"step": 2250,
"train_speed(iter/s)": 0.02006
},
{
"epoch": 0.9335541295797971,
"grad_norm": 1.7845786824484478,
"learning_rate": 5.4983991462113125e-06,
"loss": 0.7137248,
"memory(GiB)": 71.94,
"step": 2255,
"train_speed(iter/s)": 0.020061
},
{
"epoch": 0.9356240943903954,
"grad_norm": 2.3504717810438196,
"learning_rate": 5.487726787620064e-06,
"loss": 0.70752907,
"memory(GiB)": 71.94,
"step": 2260,
"train_speed(iter/s)": 0.020062
},
{
"epoch": 0.9376940592009936,
"grad_norm": 2.0225261644141797,
"learning_rate": 5.477054429028816e-06,
"loss": 0.70490198,
"memory(GiB)": 71.94,
"step": 2265,
"train_speed(iter/s)": 0.020063
},
{
"epoch": 0.9397640240115918,
"grad_norm": 2.2107863770119747,
"learning_rate": 5.466382070437567e-06,
"loss": 0.71501665,
"memory(GiB)": 71.94,
"step": 2270,
"train_speed(iter/s)": 0.020064
},
{
"epoch": 0.94183398882219,
"grad_norm": 1.9030684437330223,
"learning_rate": 5.455709711846318e-06,
"loss": 0.70587921,
"memory(GiB)": 71.94,
"step": 2275,
"train_speed(iter/s)": 0.020064
},
{
"epoch": 0.9439039536327882,
"grad_norm": 1.8981878877998872,
"learning_rate": 5.445037353255069e-06,
"loss": 0.70294933,
"memory(GiB)": 71.94,
"step": 2280,
"train_speed(iter/s)": 0.020064
},
{
"epoch": 0.9459739184433864,
"grad_norm": 1.7195321236677932,
"learning_rate": 5.434364994663821e-06,
"loss": 0.71657829,
"memory(GiB)": 71.94,
"step": 2285,
"train_speed(iter/s)": 0.020065
},
{
"epoch": 0.9480438832539847,
"grad_norm": 1.6695574824900545,
"learning_rate": 5.423692636072573e-06,
"loss": 0.70917149,
"memory(GiB)": 71.94,
"step": 2290,
"train_speed(iter/s)": 0.020065
},
{
"epoch": 0.9501138480645829,
"grad_norm": 1.7410897548688689,
"learning_rate": 5.413020277481323e-06,
"loss": 0.7276587,
"memory(GiB)": 71.94,
"step": 2295,
"train_speed(iter/s)": 0.020066
},
{
"epoch": 0.9521838128751812,
"grad_norm": 1.6737135024901502,
"learning_rate": 5.402347918890075e-06,
"loss": 0.70822001,
"memory(GiB)": 71.94,
"step": 2300,
"train_speed(iter/s)": 0.020066
},
{
"epoch": 0.9542537776857793,
"grad_norm": 1.7876076111815575,
"learning_rate": 5.391675560298827e-06,
"loss": 0.72815857,
"memory(GiB)": 71.94,
"step": 2305,
"train_speed(iter/s)": 0.020067
},
{
"epoch": 0.9563237424963775,
"grad_norm": 1.653921158054905,
"learning_rate": 5.381003201707577e-06,
"loss": 0.715973,
"memory(GiB)": 71.94,
"step": 2310,
"train_speed(iter/s)": 0.020068
},
{
"epoch": 0.9583937073069758,
"grad_norm": 1.7830026539914627,
"learning_rate": 5.370330843116329e-06,
"loss": 0.70955439,
"memory(GiB)": 71.94,
"step": 2315,
"train_speed(iter/s)": 0.020068
},
{
"epoch": 0.960463672117574,
"grad_norm": 2.0794293583699712,
"learning_rate": 5.359658484525081e-06,
"loss": 0.71212959,
"memory(GiB)": 71.94,
"step": 2320,
"train_speed(iter/s)": 0.020049
},
{
"epoch": 0.9625336369281722,
"grad_norm": 1.62651854843721,
"learning_rate": 5.348986125933831e-06,
"loss": 0.69347043,
"memory(GiB)": 71.94,
"step": 2325,
"train_speed(iter/s)": 0.02005
},
{
"epoch": 0.9646036017387705,
"grad_norm": 1.6345937025216515,
"learning_rate": 5.338313767342583e-06,
"loss": 0.68745365,
"memory(GiB)": 71.94,
"step": 2330,
"train_speed(iter/s)": 0.02005
},
{
"epoch": 0.9666735665493686,
"grad_norm": 2.1460062258102504,
"learning_rate": 5.327641408751334e-06,
"loss": 0.69751196,
"memory(GiB)": 71.94,
"step": 2335,
"train_speed(iter/s)": 0.020051
},
{
"epoch": 0.9687435313599668,
"grad_norm": 1.8428729242460318,
"learning_rate": 5.316969050160086e-06,
"loss": 0.70150051,
"memory(GiB)": 71.94,
"step": 2340,
"train_speed(iter/s)": 0.020052
},
{
"epoch": 0.9708134961705651,
"grad_norm": 1.870750640547904,
"learning_rate": 5.306296691568837e-06,
"loss": 0.67915797,
"memory(GiB)": 71.94,
"step": 2345,
"train_speed(iter/s)": 0.020053
},
{
"epoch": 0.9728834609811633,
"grad_norm": 1.6984421405387677,
"learning_rate": 5.295624332977588e-06,
"loss": 0.71915126,
"memory(GiB)": 71.94,
"step": 2350,
"train_speed(iter/s)": 0.020054
},
{
"epoch": 0.9749534257917616,
"grad_norm": 1.7839025594515001,
"learning_rate": 5.28495197438634e-06,
"loss": 0.69594321,
"memory(GiB)": 71.94,
"step": 2355,
"train_speed(iter/s)": 0.020055
},
{
"epoch": 0.9770233906023598,
"grad_norm": 1.666815065361009,
"learning_rate": 5.274279615795091e-06,
"loss": 0.68858194,
"memory(GiB)": 71.94,
"step": 2360,
"train_speed(iter/s)": 0.020055
},
{
"epoch": 0.9790933554129579,
"grad_norm": 1.6613287141536495,
"learning_rate": 5.263607257203842e-06,
"loss": 0.65968986,
"memory(GiB)": 71.94,
"step": 2365,
"train_speed(iter/s)": 0.020056
},
{
"epoch": 0.9811633202235562,
"grad_norm": 1.5579649689164343,
"learning_rate": 5.252934898612594e-06,
"loss": 0.69255476,
"memory(GiB)": 71.94,
"step": 2370,
"train_speed(iter/s)": 0.020057
},
{
"epoch": 0.9832332850341544,
"grad_norm": 1.7564735837589676,
"learning_rate": 5.242262540021346e-06,
"loss": 0.71395578,
"memory(GiB)": 71.94,
"step": 2375,
"train_speed(iter/s)": 0.020057
},
{
"epoch": 0.9853032498447526,
"grad_norm": 2.0952603393058076,
"learning_rate": 5.231590181430096e-06,
"loss": 0.69916964,
"memory(GiB)": 71.94,
"step": 2380,
"train_speed(iter/s)": 0.020058
},
{
"epoch": 0.9873732146553509,
"grad_norm": 1.851124802207451,
"learning_rate": 5.220917822838848e-06,
"loss": 0.6992053,
"memory(GiB)": 71.94,
"step": 2385,
"train_speed(iter/s)": 0.020059
},
{
"epoch": 0.9894431794659491,
"grad_norm": 1.977421778833197,
"learning_rate": 5.2102454642476e-06,
"loss": 0.70937605,
"memory(GiB)": 71.94,
"step": 2390,
"train_speed(iter/s)": 0.020059
},
{
"epoch": 0.9915131442765474,
"grad_norm": 1.8125821116129224,
"learning_rate": 5.19957310565635e-06,
"loss": 0.70189705,
"memory(GiB)": 71.94,
"step": 2395,
"train_speed(iter/s)": 0.02006
},
{
"epoch": 0.9935831090871455,
"grad_norm": 1.8812587805267227,
"learning_rate": 5.188900747065102e-06,
"loss": 0.68958111,
"memory(GiB)": 71.94,
"step": 2400,
"train_speed(iter/s)": 0.02006
},
{
"epoch": 0.9935831090871455,
"eval_loss": 0.812356173992157,
"eval_runtime": 333.5694,
"eval_samples_per_second": 18.725,
"eval_steps_per_second": 1.172,
"step": 2400
},
{
"epoch": 0.9956530738977437,
"grad_norm": 1.77525759977015,
"learning_rate": 5.178228388473853e-06,
"loss": 0.71421309,
"memory(GiB)": 71.94,
"step": 2405,
"train_speed(iter/s)": 0.019998
},
{
"epoch": 0.997723038708342,
"grad_norm": 1.9847604199741764,
"learning_rate": 5.167556029882604e-06,
"loss": 0.67816806,
"memory(GiB)": 71.94,
"step": 2410,
"train_speed(iter/s)": 0.019999
},
{
"epoch": 0.9997930035189402,
"grad_norm": 1.8831401901251863,
"learning_rate": 5.156883671291356e-06,
"loss": 0.67297964,
"memory(GiB)": 71.94,
"step": 2415,
"train_speed(iter/s)": 0.02
},
{
"epoch": 1.0018629683295384,
"grad_norm": 1.902600390439468,
"learning_rate": 5.146211312700107e-06,
"loss": 0.65476379,
"memory(GiB)": 71.94,
"step": 2420,
"train_speed(iter/s)": 0.020001
},
{
"epoch": 1.0039329331401365,
"grad_norm": 1.8958163712669238,
"learning_rate": 5.135538954108859e-06,
"loss": 0.69827843,
"memory(GiB)": 71.94,
"step": 2425,
"train_speed(iter/s)": 0.020002
},
{
"epoch": 1.006002897950735,
"grad_norm": 2.0893567670048774,
"learning_rate": 5.12486659551761e-06,
"loss": 0.6843009,
"memory(GiB)": 71.94,
"step": 2430,
"train_speed(iter/s)": 0.020004
},
{
"epoch": 1.0080728627613331,
"grad_norm": 1.7283831963515028,
"learning_rate": 5.114194236926361e-06,
"loss": 0.66953688,
"memory(GiB)": 71.94,
"step": 2435,
"train_speed(iter/s)": 0.020005
},
{
"epoch": 1.0101428275719313,
"grad_norm": 2.4550295961951023,
"learning_rate": 5.103521878335113e-06,
"loss": 0.6826447,
"memory(GiB)": 71.94,
"step": 2440,
"train_speed(iter/s)": 0.020006
},
{
"epoch": 1.0122127923825295,
"grad_norm": 2.065348792100011,
"learning_rate": 5.092849519743865e-06,
"loss": 0.71306543,
"memory(GiB)": 71.94,
"step": 2445,
"train_speed(iter/s)": 0.020006
},
{
"epoch": 1.0142827571931277,
"grad_norm": 1.8461278616269987,
"learning_rate": 5.082177161152615e-06,
"loss": 0.70268006,
"memory(GiB)": 71.94,
"step": 2450,
"train_speed(iter/s)": 0.020007
},
{
"epoch": 1.0163527220037258,
"grad_norm": 1.8474179385577107,
"learning_rate": 5.071504802561367e-06,
"loss": 0.68759146,
"memory(GiB)": 71.94,
"step": 2455,
"train_speed(iter/s)": 0.020008
},
{
"epoch": 1.0184226868143242,
"grad_norm": 2.1221766243412556,
"learning_rate": 5.060832443970117e-06,
"loss": 0.70161247,
"memory(GiB)": 71.94,
"step": 2460,
"train_speed(iter/s)": 0.020009
},
{
"epoch": 1.0204926516249224,
"grad_norm": 1.7992664488684018,
"learning_rate": 5.050160085378869e-06,
"loss": 0.70210953,
"memory(GiB)": 71.94,
"step": 2465,
"train_speed(iter/s)": 0.02001
},
{
"epoch": 1.0225626164355206,
"grad_norm": 2.010160051422228,
"learning_rate": 5.039487726787621e-06,
"loss": 0.70377779,
"memory(GiB)": 71.94,
"step": 2470,
"train_speed(iter/s)": 0.020011
},
{
"epoch": 1.0246325812461188,
"grad_norm": 1.677895969704363,
"learning_rate": 5.028815368196372e-06,
"loss": 0.71084547,
"memory(GiB)": 71.94,
"step": 2475,
"train_speed(iter/s)": 0.020012
},
{
"epoch": 1.026702546056717,
"grad_norm": 1.9057688334203953,
"learning_rate": 5.018143009605123e-06,
"loss": 0.67874904,
"memory(GiB)": 71.94,
"step": 2480,
"train_speed(iter/s)": 0.020013
},
{
"epoch": 1.0287725108673154,
"grad_norm": 1.826208938410727,
"learning_rate": 5.007470651013875e-06,
"loss": 0.67107067,
"memory(GiB)": 71.94,
"step": 2485,
"train_speed(iter/s)": 0.020014
},
{
"epoch": 1.0308424756779135,
"grad_norm": 2.0686042207681425,
"learning_rate": 4.996798292422626e-06,
"loss": 0.70525031,
"memory(GiB)": 71.94,
"step": 2490,
"train_speed(iter/s)": 0.020016
},
{
"epoch": 1.0329124404885117,
"grad_norm": 1.7733232551181717,
"learning_rate": 4.986125933831378e-06,
"loss": 0.68992233,
"memory(GiB)": 71.94,
"step": 2495,
"train_speed(iter/s)": 0.020017
},
{
"epoch": 1.03498240529911,
"grad_norm": 1.7950220673083168,
"learning_rate": 4.975453575240129e-06,
"loss": 0.67175922,
"memory(GiB)": 71.94,
"step": 2500,
"train_speed(iter/s)": 0.020017
},
{
"epoch": 1.037052370109708,
"grad_norm": 1.9232030990454307,
"learning_rate": 4.96478121664888e-06,
"loss": 0.70407228,
"memory(GiB)": 71.94,
"step": 2505,
"train_speed(iter/s)": 0.020018
},
{
"epoch": 1.0391223349203063,
"grad_norm": 1.708121386060437,
"learning_rate": 4.9541088580576316e-06,
"loss": 0.70078506,
"memory(GiB)": 71.94,
"step": 2510,
"train_speed(iter/s)": 0.020018
},
{
"epoch": 1.0411922997309047,
"grad_norm": 1.5750528842945233,
"learning_rate": 4.943436499466383e-06,
"loss": 0.66830945,
"memory(GiB)": 71.94,
"step": 2515,
"train_speed(iter/s)": 0.020019
},
{
"epoch": 1.0432622645415028,
"grad_norm": 1.8181518834205428,
"learning_rate": 4.932764140875134e-06,
"loss": 0.69259553,
"memory(GiB)": 71.94,
"step": 2520,
"train_speed(iter/s)": 0.020019
},
{
"epoch": 1.045332229352101,
"grad_norm": 1.551131375424082,
"learning_rate": 4.9220917822838855e-06,
"loss": 0.69300728,
"memory(GiB)": 71.94,
"step": 2525,
"train_speed(iter/s)": 0.02002
},
{
"epoch": 1.0474021941626992,
"grad_norm": 1.9636419108578083,
"learning_rate": 4.9114194236926366e-06,
"loss": 0.69065495,
"memory(GiB)": 71.94,
"step": 2530,
"train_speed(iter/s)": 0.020021
},
{
"epoch": 1.0494721589732974,
"grad_norm": 2.2096885596291918,
"learning_rate": 4.900747065101388e-06,
"loss": 0.72614369,
"memory(GiB)": 71.94,
"step": 2535,
"train_speed(iter/s)": 0.020022
},
{
"epoch": 1.0515421237838958,
"grad_norm": 1.8198162773587976,
"learning_rate": 4.890074706510139e-06,
"loss": 0.68779688,
"memory(GiB)": 71.94,
"step": 2540,
"train_speed(iter/s)": 0.020023
},
{
"epoch": 1.053612088594494,
"grad_norm": 1.8767827748337365,
"learning_rate": 4.8794023479188905e-06,
"loss": 0.65902538,
"memory(GiB)": 71.94,
"step": 2545,
"train_speed(iter/s)": 0.020024
},
{
"epoch": 1.0556820534050921,
"grad_norm": 1.774731724427948,
"learning_rate": 4.8687299893276416e-06,
"loss": 0.6885426,
"memory(GiB)": 71.94,
"step": 2550,
"train_speed(iter/s)": 0.020025
},
{
"epoch": 1.0577520182156903,
"grad_norm": 1.7292661960079105,
"learning_rate": 4.858057630736393e-06,
"loss": 0.67627816,
"memory(GiB)": 71.94,
"step": 2555,
"train_speed(iter/s)": 0.020026
},
{
"epoch": 1.0598219830262885,
"grad_norm": 2.022917994019762,
"learning_rate": 4.8473852721451445e-06,
"loss": 0.69524202,
"memory(GiB)": 71.94,
"step": 2560,
"train_speed(iter/s)": 0.020027
},
{
"epoch": 1.0618919478368867,
"grad_norm": 1.7503339393851076,
"learning_rate": 4.8367129135538955e-06,
"loss": 0.69861703,
"memory(GiB)": 71.94,
"step": 2565,
"train_speed(iter/s)": 0.020028
},
{
"epoch": 1.063961912647485,
"grad_norm": 2.3241272289849126,
"learning_rate": 4.826040554962647e-06,
"loss": 0.69416137,
"memory(GiB)": 71.94,
"step": 2570,
"train_speed(iter/s)": 0.020029
},
{
"epoch": 1.0660318774580833,
"grad_norm": 1.9145774620065716,
"learning_rate": 4.8153681963713985e-06,
"loss": 0.707304,
"memory(GiB)": 71.94,
"step": 2575,
"train_speed(iter/s)": 0.02003
},
{
"epoch": 1.0681018422686814,
"grad_norm": 2.2205440096454363,
"learning_rate": 4.80469583778015e-06,
"loss": 0.68690495,
"memory(GiB)": 71.94,
"step": 2580,
"train_speed(iter/s)": 0.02003
},
{
"epoch": 1.0701718070792796,
"grad_norm": 1.907681817748529,
"learning_rate": 4.794023479188901e-06,
"loss": 0.67836595,
"memory(GiB)": 71.94,
"step": 2585,
"train_speed(iter/s)": 0.020032
},
{
"epoch": 1.0722417718898778,
"grad_norm": 1.812113935386319,
"learning_rate": 4.783351120597652e-06,
"loss": 0.69949398,
"memory(GiB)": 71.94,
"step": 2590,
"train_speed(iter/s)": 0.020032
},
{
"epoch": 1.0743117367004762,
"grad_norm": 1.9890056601624129,
"learning_rate": 4.7726787620064035e-06,
"loss": 0.68311234,
"memory(GiB)": 71.94,
"step": 2595,
"train_speed(iter/s)": 0.020032
},
{
"epoch": 1.0763817015110744,
"grad_norm": 1.6389803143854558,
"learning_rate": 4.762006403415155e-06,
"loss": 0.68783474,
"memory(GiB)": 71.94,
"step": 2600,
"train_speed(iter/s)": 0.020034
},
{
"epoch": 1.0784516663216726,
"grad_norm": 1.7243055433624384,
"learning_rate": 4.751334044823906e-06,
"loss": 0.68109202,
"memory(GiB)": 71.94,
"step": 2605,
"train_speed(iter/s)": 0.020035
},
{
"epoch": 1.0805216311322707,
"grad_norm": 2.0564605414653356,
"learning_rate": 4.740661686232657e-06,
"loss": 0.67778783,
"memory(GiB)": 71.94,
"step": 2610,
"train_speed(iter/s)": 0.020036
},
{
"epoch": 1.082591595942869,
"grad_norm": 1.9673716530688552,
"learning_rate": 4.729989327641409e-06,
"loss": 0.67061253,
"memory(GiB)": 71.94,
"step": 2615,
"train_speed(iter/s)": 0.020037
},
{
"epoch": 1.084661560753467,
"grad_norm": 2.256366938059263,
"learning_rate": 4.71931696905016e-06,
"loss": 0.6971777,
"memory(GiB)": 71.94,
"step": 2620,
"train_speed(iter/s)": 0.020038
},
{
"epoch": 1.0867315255640655,
"grad_norm": 2.092500467589189,
"learning_rate": 4.708644610458911e-06,
"loss": 0.67864285,
"memory(GiB)": 71.94,
"step": 2625,
"train_speed(iter/s)": 0.020039
},
{
"epoch": 1.0888014903746637,
"grad_norm": 1.7063119566002638,
"learning_rate": 4.697972251867663e-06,
"loss": 0.66528416,
"memory(GiB)": 71.94,
"step": 2630,
"train_speed(iter/s)": 0.02004
},
{
"epoch": 1.0908714551852619,
"grad_norm": 1.7825515797938636,
"learning_rate": 4.687299893276414e-06,
"loss": 0.67014618,
"memory(GiB)": 71.94,
"step": 2635,
"train_speed(iter/s)": 0.02004
},
{
"epoch": 1.09294141999586,
"grad_norm": 1.7034629950636981,
"learning_rate": 4.676627534685166e-06,
"loss": 0.67851248,
"memory(GiB)": 71.94,
"step": 2640,
"train_speed(iter/s)": 0.020041
},
{
"epoch": 1.0950113848064582,
"grad_norm": 1.6535356357438644,
"learning_rate": 4.665955176093917e-06,
"loss": 0.69617391,
"memory(GiB)": 71.94,
"step": 2645,
"train_speed(iter/s)": 0.020042
},
{
"epoch": 1.0970813496170566,
"grad_norm": 1.790083800076922,
"learning_rate": 4.655282817502668e-06,
"loss": 0.68771133,
"memory(GiB)": 71.94,
"step": 2650,
"train_speed(iter/s)": 0.020042
},
{
"epoch": 1.0991513144276548,
"grad_norm": 1.9625451394764908,
"learning_rate": 4.64461045891142e-06,
"loss": 0.68705645,
"memory(GiB)": 71.94,
"step": 2655,
"train_speed(iter/s)": 0.020043
},
{
"epoch": 1.101221279238253,
"grad_norm": 2.41883263144616,
"learning_rate": 4.633938100320171e-06,
"loss": 0.66960602,
"memory(GiB)": 71.94,
"step": 2660,
"train_speed(iter/s)": 0.020044
},
{
"epoch": 1.1032912440488511,
"grad_norm": 1.6833879907399165,
"learning_rate": 4.623265741728922e-06,
"loss": 0.67477846,
"memory(GiB)": 71.94,
"step": 2665,
"train_speed(iter/s)": 0.020045
},
{
"epoch": 1.1053612088594493,
"grad_norm": 1.84460802825979,
"learning_rate": 4.612593383137674e-06,
"loss": 0.66959124,
"memory(GiB)": 71.94,
"step": 2670,
"train_speed(iter/s)": 0.020046
},
{
"epoch": 1.1074311736700475,
"grad_norm": 1.8226080355090242,
"learning_rate": 4.601921024546425e-06,
"loss": 0.66996727,
"memory(GiB)": 71.94,
"step": 2675,
"train_speed(iter/s)": 0.020047
},
{
"epoch": 1.109501138480646,
"grad_norm": 1.9273809027924724,
"learning_rate": 4.591248665955176e-06,
"loss": 0.69185095,
"memory(GiB)": 71.94,
"step": 2680,
"train_speed(iter/s)": 0.020048
},
{
"epoch": 1.111571103291244,
"grad_norm": 2.2322742496415517,
"learning_rate": 4.580576307363927e-06,
"loss": 0.70724788,
"memory(GiB)": 71.94,
"step": 2685,
"train_speed(iter/s)": 0.020049
},
{
"epoch": 1.1136410681018423,
"grad_norm": 2.1881226444266133,
"learning_rate": 4.569903948772679e-06,
"loss": 0.67986469,
"memory(GiB)": 71.94,
"step": 2690,
"train_speed(iter/s)": 0.020049
},
{
"epoch": 1.1157110329124404,
"grad_norm": 1.9344503535315112,
"learning_rate": 4.55923159018143e-06,
"loss": 0.6932023,
"memory(GiB)": 71.94,
"step": 2695,
"train_speed(iter/s)": 0.02005
},
{
"epoch": 1.1177809977230386,
"grad_norm": 2.0197719509697056,
"learning_rate": 4.548559231590182e-06,
"loss": 0.67475772,
"memory(GiB)": 71.94,
"step": 2700,
"train_speed(iter/s)": 0.020051
},
{
"epoch": 1.1177809977230386,
"eval_loss": 0.8077359795570374,
"eval_runtime": 332.9193,
"eval_samples_per_second": 18.761,
"eval_steps_per_second": 1.174,
"step": 2700
},
{
"epoch": 1.119850962533637,
"grad_norm": 1.8305900241415576,
"learning_rate": 4.537886872998933e-06,
"loss": 0.71444244,
"memory(GiB)": 71.94,
"step": 2705,
"train_speed(iter/s)": 0.019995
},
{
"epoch": 1.1219209273442352,
"grad_norm": 1.9866747607495971,
"learning_rate": 4.527214514407685e-06,
"loss": 0.68752618,
"memory(GiB)": 71.94,
"step": 2710,
"train_speed(iter/s)": 0.019996
},
{
"epoch": 1.1239908921548334,
"grad_norm": 1.8866009316885002,
"learning_rate": 4.516542155816436e-06,
"loss": 0.67673769,
"memory(GiB)": 71.94,
"step": 2715,
"train_speed(iter/s)": 0.019997
},
{
"epoch": 1.1260608569654316,
"grad_norm": 1.8693665648036668,
"learning_rate": 4.505869797225187e-06,
"loss": 0.68302212,
"memory(GiB)": 71.94,
"step": 2720,
"train_speed(iter/s)": 0.019997
},
{
"epoch": 1.1281308217760297,
"grad_norm": 2.088485544417028,
"learning_rate": 4.495197438633939e-06,
"loss": 0.69513187,
"memory(GiB)": 71.94,
"step": 2725,
"train_speed(iter/s)": 0.019999
},
{
"epoch": 1.1302007865866281,
"grad_norm": 1.83149183273408,
"learning_rate": 4.48452508004269e-06,
"loss": 0.67474871,
"memory(GiB)": 71.94,
"step": 2730,
"train_speed(iter/s)": 0.02
},
{
"epoch": 1.1322707513972263,
"grad_norm": 1.9681727198174188,
"learning_rate": 4.473852721451441e-06,
"loss": 0.6727396,
"memory(GiB)": 71.94,
"step": 2735,
"train_speed(iter/s)": 0.020001
},
{
"epoch": 1.1343407162078245,
"grad_norm": 3.0386548884501288,
"learning_rate": 4.463180362860193e-06,
"loss": 0.69408131,
"memory(GiB)": 71.94,
"step": 2740,
"train_speed(iter/s)": 0.020002
},
{
"epoch": 1.1364106810184227,
"grad_norm": 3.156158661144904,
"learning_rate": 4.452508004268944e-06,
"loss": 0.6340518,
"memory(GiB)": 71.94,
"step": 2745,
"train_speed(iter/s)": 0.020003
},
{
"epoch": 1.1384806458290209,
"grad_norm": 2.005947464562104,
"learning_rate": 4.441835645677695e-06,
"loss": 0.68589301,
"memory(GiB)": 71.94,
"step": 2750,
"train_speed(iter/s)": 0.020004
},
{
"epoch": 1.140550610639619,
"grad_norm": 1.8910732500138312,
"learning_rate": 4.431163287086446e-06,
"loss": 0.68320475,
"memory(GiB)": 71.94,
"step": 2755,
"train_speed(iter/s)": 0.020005
},
{
"epoch": 1.1426205754502172,
"grad_norm": 1.5980061705957451,
"learning_rate": 4.420490928495198e-06,
"loss": 0.68288679,
"memory(GiB)": 71.94,
"step": 2760,
"train_speed(iter/s)": 0.020006
},
{
"epoch": 1.1446905402608156,
"grad_norm": 2.1893217377555367,
"learning_rate": 4.409818569903949e-06,
"loss": 0.67679596,
"memory(GiB)": 71.94,
"step": 2765,
"train_speed(iter/s)": 0.020007
},
{
"epoch": 1.1467605050714138,
"grad_norm": 1.6144531687050387,
"learning_rate": 4.3991462113127e-06,
"loss": 0.67341776,
"memory(GiB)": 71.94,
"step": 2770,
"train_speed(iter/s)": 0.020008
},
{
"epoch": 1.148830469882012,
"grad_norm": 1.7848487108948465,
"learning_rate": 4.388473852721452e-06,
"loss": 0.65265465,
"memory(GiB)": 71.94,
"step": 2775,
"train_speed(iter/s)": 0.020008
},
{
"epoch": 1.1509004346926102,
"grad_norm": 1.9198684626115081,
"learning_rate": 4.377801494130203e-06,
"loss": 0.65432949,
"memory(GiB)": 71.94,
"step": 2780,
"train_speed(iter/s)": 0.02001
},
{
"epoch": 1.1529703995032086,
"grad_norm": 1.8075401444295365,
"learning_rate": 4.367129135538955e-06,
"loss": 0.66166754,
"memory(GiB)": 71.94,
"step": 2785,
"train_speed(iter/s)": 0.020011
},
{
"epoch": 1.1550403643138067,
"grad_norm": 2.093007547124246,
"learning_rate": 4.356456776947706e-06,
"loss": 0.671489,
"memory(GiB)": 71.94,
"step": 2790,
"train_speed(iter/s)": 0.020011
},
{
"epoch": 1.157110329124405,
"grad_norm": 2.155911339314564,
"learning_rate": 4.345784418356458e-06,
"loss": 0.66719484,
"memory(GiB)": 71.94,
"step": 2795,
"train_speed(iter/s)": 0.020013
},
{
"epoch": 1.159180293935003,
"grad_norm": 1.7929721304219823,
"learning_rate": 4.335112059765209e-06,
"loss": 0.65916939,
"memory(GiB)": 71.94,
"step": 2800,
"train_speed(iter/s)": 0.020014
},
{
"epoch": 1.1612502587456013,
"grad_norm": 2.3382800828112695,
"learning_rate": 4.32443970117396e-06,
"loss": 0.68535299,
"memory(GiB)": 71.94,
"step": 2805,
"train_speed(iter/s)": 0.020014
},
{
"epoch": 1.1633202235561995,
"grad_norm": 1.9393370355158424,
"learning_rate": 4.313767342582711e-06,
"loss": 0.68975515,
"memory(GiB)": 71.94,
"step": 2810,
"train_speed(iter/s)": 0.020015
},
{
"epoch": 1.1653901883667976,
"grad_norm": 2.1161711862572172,
"learning_rate": 4.303094983991463e-06,
"loss": 0.68595347,
"memory(GiB)": 71.94,
"step": 2815,
"train_speed(iter/s)": 0.020016
},
{
"epoch": 1.167460153177396,
"grad_norm": 1.89196514815735,
"learning_rate": 4.292422625400214e-06,
"loss": 0.66012936,
"memory(GiB)": 71.94,
"step": 2820,
"train_speed(iter/s)": 0.020018
},
{
"epoch": 1.1695301179879942,
"grad_norm": 1.8601131110854523,
"learning_rate": 4.281750266808965e-06,
"loss": 0.68001904,
"memory(GiB)": 71.94,
"step": 2825,
"train_speed(iter/s)": 0.020019
},
{
"epoch": 1.1716000827985924,
"grad_norm": 1.8930363611249428,
"learning_rate": 4.271077908217717e-06,
"loss": 0.66193466,
"memory(GiB)": 71.94,
"step": 2830,
"train_speed(iter/s)": 0.02002
},
{
"epoch": 1.1736700476091906,
"grad_norm": 1.651757107446397,
"learning_rate": 4.260405549626468e-06,
"loss": 0.67280817,
"memory(GiB)": 71.94,
"step": 2835,
"train_speed(iter/s)": 0.020021
},
{
"epoch": 1.175740012419789,
"grad_norm": 1.6748291349437752,
"learning_rate": 4.249733191035219e-06,
"loss": 0.70100274,
"memory(GiB)": 71.94,
"step": 2840,
"train_speed(iter/s)": 0.020022
},
{
"epoch": 1.1778099772303872,
"grad_norm": 1.7834078724205271,
"learning_rate": 4.239060832443971e-06,
"loss": 0.64067845,
"memory(GiB)": 71.94,
"step": 2845,
"train_speed(iter/s)": 0.020023
},
{
"epoch": 1.1798799420409853,
"grad_norm": 1.9357492367842137,
"learning_rate": 4.228388473852722e-06,
"loss": 0.68998647,
"memory(GiB)": 71.94,
"step": 2850,
"train_speed(iter/s)": 0.020024
},
{
"epoch": 1.1819499068515835,
"grad_norm": 1.9186551723129406,
"learning_rate": 4.2177161152614736e-06,
"loss": 0.67889709,
"memory(GiB)": 71.94,
"step": 2855,
"train_speed(iter/s)": 0.020023
},
{
"epoch": 1.1840198716621817,
"grad_norm": 1.9166194791943714,
"learning_rate": 4.207043756670225e-06,
"loss": 0.67329493,
"memory(GiB)": 71.94,
"step": 2860,
"train_speed(iter/s)": 0.020024
},
{
"epoch": 1.1860898364727799,
"grad_norm": 1.9439517613212347,
"learning_rate": 4.196371398078976e-06,
"loss": 0.6614254,
"memory(GiB)": 71.94,
"step": 2865,
"train_speed(iter/s)": 0.020026
},
{
"epoch": 1.188159801283378,
"grad_norm": 1.6420722807797328,
"learning_rate": 4.1856990394877275e-06,
"loss": 0.67023277,
"memory(GiB)": 71.94,
"step": 2870,
"train_speed(iter/s)": 0.020026
},
{
"epoch": 1.1902297660939765,
"grad_norm": 1.7445706716402636,
"learning_rate": 4.1750266808964786e-06,
"loss": 0.67275252,
"memory(GiB)": 71.94,
"step": 2875,
"train_speed(iter/s)": 0.020027
},
{
"epoch": 1.1922997309045746,
"grad_norm": 1.8273002856280824,
"learning_rate": 4.16435432230523e-06,
"loss": 0.65841031,
"memory(GiB)": 71.94,
"step": 2880,
"train_speed(iter/s)": 0.020028
},
{
"epoch": 1.1943696957151728,
"grad_norm": 1.7392597436189736,
"learning_rate": 4.1536819637139815e-06,
"loss": 0.66225605,
"memory(GiB)": 71.94,
"step": 2885,
"train_speed(iter/s)": 0.020029
},
{
"epoch": 1.196439660525771,
"grad_norm": 1.639200530922351,
"learning_rate": 4.1430096051227325e-06,
"loss": 0.65032516,
"memory(GiB)": 71.94,
"step": 2890,
"train_speed(iter/s)": 0.02003
},
{
"epoch": 1.1985096253363694,
"grad_norm": 1.8960982589133293,
"learning_rate": 4.1323372465314836e-06,
"loss": 0.69994841,
"memory(GiB)": 71.94,
"step": 2895,
"train_speed(iter/s)": 0.020031
},
{
"epoch": 1.2005795901469676,
"grad_norm": 2.1518082070491613,
"learning_rate": 4.121664887940235e-06,
"loss": 0.64357581,
"memory(GiB)": 71.94,
"step": 2900,
"train_speed(iter/s)": 0.020031
},
{
"epoch": 1.2026495549575658,
"grad_norm": 2.450141804935637,
"learning_rate": 4.1109925293489865e-06,
"loss": 0.68345547,
"memory(GiB)": 71.94,
"step": 2905,
"train_speed(iter/s)": 0.020032
},
{
"epoch": 1.204719519768164,
"grad_norm": 1.8681725857258895,
"learning_rate": 4.1003201707577375e-06,
"loss": 0.66512461,
"memory(GiB)": 71.94,
"step": 2910,
"train_speed(iter/s)": 0.020033
},
{
"epoch": 1.2067894845787621,
"grad_norm": 2.2592923799668774,
"learning_rate": 4.089647812166489e-06,
"loss": 0.68196335,
"memory(GiB)": 71.94,
"step": 2915,
"train_speed(iter/s)": 0.020034
},
{
"epoch": 1.2088594493893603,
"grad_norm": 2.0216041168873775,
"learning_rate": 4.0789754535752404e-06,
"loss": 0.6808126,
"memory(GiB)": 71.94,
"step": 2920,
"train_speed(iter/s)": 0.020035
},
{
"epoch": 1.2109294141999585,
"grad_norm": 1.7659468605949793,
"learning_rate": 4.0683030949839915e-06,
"loss": 0.64283953,
"memory(GiB)": 71.94,
"step": 2925,
"train_speed(iter/s)": 0.020037
},
{
"epoch": 1.2129993790105569,
"grad_norm": 1.970065027416205,
"learning_rate": 4.057630736392743e-06,
"loss": 0.6555728,
"memory(GiB)": 71.94,
"step": 2930,
"train_speed(iter/s)": 0.020038
},
{
"epoch": 1.215069343821155,
"grad_norm": 1.998737392424493,
"learning_rate": 4.046958377801494e-06,
"loss": 0.6660428,
"memory(GiB)": 71.94,
"step": 2935,
"train_speed(iter/s)": 0.020039
},
{
"epoch": 1.2171393086317532,
"grad_norm": 2.1589153805149173,
"learning_rate": 4.036286019210246e-06,
"loss": 0.65805073,
"memory(GiB)": 71.94,
"step": 2940,
"train_speed(iter/s)": 0.02004
},
{
"epoch": 1.2192092734423514,
"grad_norm": 1.990247509684529,
"learning_rate": 4.025613660618997e-06,
"loss": 0.6413794,
"memory(GiB)": 71.94,
"step": 2945,
"train_speed(iter/s)": 0.020041
},
{
"epoch": 1.2212792382529498,
"grad_norm": 2.0438116812415625,
"learning_rate": 4.014941302027748e-06,
"loss": 0.65000477,
"memory(GiB)": 71.94,
"step": 2950,
"train_speed(iter/s)": 0.020042
},
{
"epoch": 1.223349203063548,
"grad_norm": 1.6608044995599232,
"learning_rate": 4.004268943436499e-06,
"loss": 0.65562267,
"memory(GiB)": 71.94,
"step": 2955,
"train_speed(iter/s)": 0.020042
},
{
"epoch": 1.2254191678741462,
"grad_norm": 1.9291845395844707,
"learning_rate": 3.993596584845251e-06,
"loss": 0.67264051,
"memory(GiB)": 71.94,
"step": 2960,
"train_speed(iter/s)": 0.020043
},
{
"epoch": 1.2274891326847444,
"grad_norm": 2.3618922331006753,
"learning_rate": 3.982924226254002e-06,
"loss": 0.68117104,
"memory(GiB)": 71.94,
"step": 2965,
"train_speed(iter/s)": 0.020044
},
{
"epoch": 1.2295590974953425,
"grad_norm": 1.9561928806095756,
"learning_rate": 3.972251867662753e-06,
"loss": 0.6778089,
"memory(GiB)": 71.94,
"step": 2970,
"train_speed(iter/s)": 0.020045
},
{
"epoch": 1.2316290623059407,
"grad_norm": 2.103021206596317,
"learning_rate": 3.961579509071505e-06,
"loss": 0.66529841,
"memory(GiB)": 71.94,
"step": 2975,
"train_speed(iter/s)": 0.020046
},
{
"epoch": 1.2336990271165391,
"grad_norm": 1.6317138464756236,
"learning_rate": 3.950907150480256e-06,
"loss": 0.63005896,
"memory(GiB)": 71.94,
"step": 2980,
"train_speed(iter/s)": 0.020047
},
{
"epoch": 1.2357689919271373,
"grad_norm": 1.8763154321083348,
"learning_rate": 3.940234791889007e-06,
"loss": 0.69782147,
"memory(GiB)": 71.94,
"step": 2985,
"train_speed(iter/s)": 0.020048
},
{
"epoch": 1.2378389567377355,
"grad_norm": 2.12720513001939,
"learning_rate": 3.929562433297759e-06,
"loss": 0.67128716,
"memory(GiB)": 71.94,
"step": 2990,
"train_speed(iter/s)": 0.020049
},
{
"epoch": 1.2399089215483337,
"grad_norm": 2.0560360745042243,
"learning_rate": 3.91889007470651e-06,
"loss": 0.66187463,
"memory(GiB)": 71.94,
"step": 2995,
"train_speed(iter/s)": 0.02005
},
{
"epoch": 1.2419788863589318,
"grad_norm": 1.9884557277874149,
"learning_rate": 3.908217716115262e-06,
"loss": 0.68540969,
"memory(GiB)": 71.94,
"step": 3000,
"train_speed(iter/s)": 0.020051
},
{
"epoch": 1.2419788863589318,
"eval_loss": 0.8033931255340576,
"eval_runtime": 334.3532,
"eval_samples_per_second": 18.681,
"eval_steps_per_second": 1.169,
"step": 3000
},
{
"epoch": 1.2440488511695302,
"grad_norm": 1.8132336089456924,
"learning_rate": 3.897545357524013e-06,
"loss": 0.65594606,
"memory(GiB)": 71.94,
"step": 3005,
"train_speed(iter/s)": 0.020001
},
{
"epoch": 1.2461188159801284,
"grad_norm": 1.6982582415846914,
"learning_rate": 3.886872998932765e-06,
"loss": 0.65645032,
"memory(GiB)": 71.94,
"step": 3010,
"train_speed(iter/s)": 0.020003
},
{
"epoch": 1.2481887807907266,
"grad_norm": 1.9942609739954071,
"learning_rate": 3.876200640341516e-06,
"loss": 0.66572218,
"memory(GiB)": 71.94,
"step": 3015,
"train_speed(iter/s)": 0.020004
},
{
"epoch": 1.2502587456013248,
"grad_norm": 1.7952303972411592,
"learning_rate": 3.865528281750267e-06,
"loss": 0.64628544,
"memory(GiB)": 71.94,
"step": 3020,
"train_speed(iter/s)": 0.020005
},
{
"epoch": 1.252328710411923,
"grad_norm": 1.9058576079441742,
"learning_rate": 3.854855923159018e-06,
"loss": 0.68286681,
"memory(GiB)": 71.94,
"step": 3025,
"train_speed(iter/s)": 0.020005
},
{
"epoch": 1.2543986752225211,
"grad_norm": 2.4990275874354357,
"learning_rate": 3.84418356456777e-06,
"loss": 0.67756252,
"memory(GiB)": 71.94,
"step": 3030,
"train_speed(iter/s)": 0.020006
},
{
"epoch": 1.2564686400331193,
"grad_norm": 2.18800214487155,
"learning_rate": 3.833511205976521e-06,
"loss": 0.65295424,
"memory(GiB)": 71.94,
"step": 3035,
"train_speed(iter/s)": 0.020006
},
{
"epoch": 1.2585386048437177,
"grad_norm": 2.1525831686251737,
"learning_rate": 3.822838847385272e-06,
"loss": 0.66122398,
"memory(GiB)": 71.94,
"step": 3040,
"train_speed(iter/s)": 0.020007
},
{
"epoch": 1.260608569654316,
"grad_norm": 2.1001976709383374,
"learning_rate": 3.812166488794024e-06,
"loss": 0.64958653,
"memory(GiB)": 71.94,
"step": 3045,
"train_speed(iter/s)": 0.020008
},
{
"epoch": 1.262678534464914,
"grad_norm": 1.8005403174974608,
"learning_rate": 3.801494130202775e-06,
"loss": 0.66733065,
"memory(GiB)": 71.94,
"step": 3050,
"train_speed(iter/s)": 0.020009
},
{
"epoch": 1.2647484992755123,
"grad_norm": 2.0392708573153255,
"learning_rate": 3.7908217716115265e-06,
"loss": 0.65150566,
"memory(GiB)": 71.94,
"step": 3055,
"train_speed(iter/s)": 0.02001
},
{
"epoch": 1.2668184640861107,
"grad_norm": 1.9818256506070209,
"learning_rate": 3.7801494130202776e-06,
"loss": 0.65937147,
"memory(GiB)": 71.94,
"step": 3060,
"train_speed(iter/s)": 0.02001
},
{
"epoch": 1.2688884288967088,
"grad_norm": 2.091473795655044,
"learning_rate": 3.7694770544290294e-06,
"loss": 0.65645385,
"memory(GiB)": 71.94,
"step": 3065,
"train_speed(iter/s)": 0.020011
},
{
"epoch": 1.270958393707307,
"grad_norm": 1.9887693066644563,
"learning_rate": 3.7588046958377805e-06,
"loss": 0.6356863,
"memory(GiB)": 71.94,
"step": 3070,
"train_speed(iter/s)": 0.020013
},
{
"epoch": 1.2730283585179052,
"grad_norm": 1.849373670863495,
"learning_rate": 3.7481323372465315e-06,
"loss": 0.64913082,
"memory(GiB)": 71.94,
"step": 3075,
"train_speed(iter/s)": 0.020013
},
{
"epoch": 1.2750983233285034,
"grad_norm": 2.8796661699092776,
"learning_rate": 3.737459978655283e-06,
"loss": 0.66612153,
"memory(GiB)": 71.94,
"step": 3080,
"train_speed(iter/s)": 0.020015
},
{
"epoch": 1.2771682881391015,
"grad_norm": 1.9206629854919683,
"learning_rate": 3.7267876200640345e-06,
"loss": 0.65432153,
"memory(GiB)": 71.94,
"step": 3085,
"train_speed(iter/s)": 0.020016
},
{
"epoch": 1.2792382529496997,
"grad_norm": 2.0649234898157562,
"learning_rate": 3.716115261472786e-06,
"loss": 0.63801923,
"memory(GiB)": 71.94,
"step": 3090,
"train_speed(iter/s)": 0.020016
},
{
"epoch": 1.2813082177602981,
"grad_norm": 1.859875670695032,
"learning_rate": 3.705442902881537e-06,
"loss": 0.6548945,
"memory(GiB)": 71.94,
"step": 3095,
"train_speed(iter/s)": 0.020017
},
{
"epoch": 1.2833781825708963,
"grad_norm": 1.767661952337357,
"learning_rate": 3.694770544290289e-06,
"loss": 0.64956121,
"memory(GiB)": 71.94,
"step": 3100,
"train_speed(iter/s)": 0.020018
},
{
"epoch": 1.2854481473814945,
"grad_norm": 1.8007833500981838,
"learning_rate": 3.68409818569904e-06,
"loss": 0.669453,
"memory(GiB)": 71.94,
"step": 3105,
"train_speed(iter/s)": 0.020019
},
{
"epoch": 1.2875181121920927,
"grad_norm": 2.0099069002485863,
"learning_rate": 3.673425827107791e-06,
"loss": 0.67178354,
"memory(GiB)": 71.94,
"step": 3110,
"train_speed(iter/s)": 0.02002
},
{
"epoch": 1.289588077002691,
"grad_norm": 2.0463019564128886,
"learning_rate": 3.6627534685165424e-06,
"loss": 0.65595369,
"memory(GiB)": 71.94,
"step": 3115,
"train_speed(iter/s)": 0.02002
},
{
"epoch": 1.2916580418132892,
"grad_norm": 1.9152750413831356,
"learning_rate": 3.652081109925294e-06,
"loss": 0.65940399,
"memory(GiB)": 71.94,
"step": 3120,
"train_speed(iter/s)": 0.020021
},
{
"epoch": 1.2937280066238874,
"grad_norm": 2.9816950881544306,
"learning_rate": 3.6414087513340453e-06,
"loss": 0.66090908,
"memory(GiB)": 71.94,
"step": 3125,
"train_speed(iter/s)": 0.020022
},
{
"epoch": 1.2957979714344856,
"grad_norm": 1.8591409546980415,
"learning_rate": 3.6307363927427963e-06,
"loss": 0.66902189,
"memory(GiB)": 71.94,
"step": 3130,
"train_speed(iter/s)": 0.020023
},
{
"epoch": 1.2978679362450838,
"grad_norm": 2.078590180758969,
"learning_rate": 3.6200640341515482e-06,
"loss": 0.66616745,
"memory(GiB)": 71.94,
"step": 3135,
"train_speed(iter/s)": 0.020024
},
{
"epoch": 1.299937901055682,
"grad_norm": 1.8683848031966166,
"learning_rate": 3.6093916755602993e-06,
"loss": 0.64836683,
"memory(GiB)": 71.94,
"step": 3140,
"train_speed(iter/s)": 0.020025
},
{
"epoch": 1.3020078658662801,
"grad_norm": 1.7924123567589454,
"learning_rate": 3.5987193169690503e-06,
"loss": 0.66824627,
"memory(GiB)": 71.94,
"step": 3145,
"train_speed(iter/s)": 0.020026
},
{
"epoch": 1.3040778306768785,
"grad_norm": 1.94170729576761,
"learning_rate": 3.5880469583778018e-06,
"loss": 0.67116079,
"memory(GiB)": 71.94,
"step": 3150,
"train_speed(iter/s)": 0.020026
},
{
"epoch": 1.3061477954874767,
"grad_norm": 2.1208196521106357,
"learning_rate": 3.5773745997865532e-06,
"loss": 0.66398339,
"memory(GiB)": 71.94,
"step": 3155,
"train_speed(iter/s)": 0.020027
},
{
"epoch": 1.308217760298075,
"grad_norm": 1.94319742259963,
"learning_rate": 3.5667022411953047e-06,
"loss": 0.66830492,
"memory(GiB)": 71.94,
"step": 3160,
"train_speed(iter/s)": 0.020028
},
{
"epoch": 1.310287725108673,
"grad_norm": 1.8986704786653348,
"learning_rate": 3.5560298826040557e-06,
"loss": 0.64020205,
"memory(GiB)": 71.94,
"step": 3165,
"train_speed(iter/s)": 0.020028
},
{
"epoch": 1.3123576899192715,
"grad_norm": 1.832166033780513,
"learning_rate": 3.5453575240128068e-06,
"loss": 0.65297813,
"memory(GiB)": 71.94,
"step": 3170,
"train_speed(iter/s)": 0.020029
},
{
"epoch": 1.3144276547298697,
"grad_norm": 1.7524632564639653,
"learning_rate": 3.5346851654215586e-06,
"loss": 0.65833459,
"memory(GiB)": 71.94,
"step": 3175,
"train_speed(iter/s)": 0.020029
},
{
"epoch": 1.3164976195404678,
"grad_norm": 1.784819778645554,
"learning_rate": 3.5240128068303097e-06,
"loss": 0.65319304,
"memory(GiB)": 71.94,
"step": 3180,
"train_speed(iter/s)": 0.02003
},
{
"epoch": 1.318567584351066,
"grad_norm": 1.7883657805218875,
"learning_rate": 3.513340448239061e-06,
"loss": 0.65240812,
"memory(GiB)": 71.94,
"step": 3185,
"train_speed(iter/s)": 0.02003
},
{
"epoch": 1.3206375491616642,
"grad_norm": 1.894144860689911,
"learning_rate": 3.5026680896478126e-06,
"loss": 0.6723794,
"memory(GiB)": 71.94,
"step": 3190,
"train_speed(iter/s)": 0.020031
},
{
"epoch": 1.3227075139722624,
"grad_norm": 2.0320221849381195,
"learning_rate": 3.491995731056564e-06,
"loss": 0.64077559,
"memory(GiB)": 71.94,
"step": 3195,
"train_speed(iter/s)": 0.020032
},
{
"epoch": 1.3247774787828606,
"grad_norm": 1.727824740633838,
"learning_rate": 3.481323372465315e-06,
"loss": 0.63609524,
"memory(GiB)": 71.94,
"step": 3200,
"train_speed(iter/s)": 0.020033
},
{
"epoch": 1.326847443593459,
"grad_norm": 1.7454884742520516,
"learning_rate": 3.470651013874066e-06,
"loss": 0.65425596,
"memory(GiB)": 71.94,
"step": 3205,
"train_speed(iter/s)": 0.020033
},
{
"epoch": 1.3289174084040571,
"grad_norm": 1.8911130178984759,
"learning_rate": 3.459978655282818e-06,
"loss": 0.66199875,
"memory(GiB)": 71.94,
"step": 3210,
"train_speed(iter/s)": 0.020034
},
{
"epoch": 1.3309873732146553,
"grad_norm": 2.3250937309778474,
"learning_rate": 3.449306296691569e-06,
"loss": 0.65385923,
"memory(GiB)": 71.94,
"step": 3215,
"train_speed(iter/s)": 0.020035
},
{
"epoch": 1.3330573380252535,
"grad_norm": 2.1704453467921447,
"learning_rate": 3.43863393810032e-06,
"loss": 0.66229916,
"memory(GiB)": 71.94,
"step": 3220,
"train_speed(iter/s)": 0.020036
},
{
"epoch": 1.335127302835852,
"grad_norm": 2.2209117698054768,
"learning_rate": 3.427961579509072e-06,
"loss": 0.65051212,
"memory(GiB)": 71.94,
"step": 3225,
"train_speed(iter/s)": 0.020036
},
{
"epoch": 1.33719726764645,
"grad_norm": 1.6845784156480532,
"learning_rate": 3.417289220917823e-06,
"loss": 0.64576015,
"memory(GiB)": 71.94,
"step": 3230,
"train_speed(iter/s)": 0.020037
},
{
"epoch": 1.3392672324570483,
"grad_norm": 2.7721561024455332,
"learning_rate": 3.4066168623265745e-06,
"loss": 0.68257604,
"memory(GiB)": 71.94,
"step": 3235,
"train_speed(iter/s)": 0.020038
},
{
"epoch": 1.3413371972676464,
"grad_norm": 1.9362479296558788,
"learning_rate": 3.3959445037353255e-06,
"loss": 0.63483586,
"memory(GiB)": 71.94,
"step": 3240,
"train_speed(iter/s)": 0.020038
},
{
"epoch": 1.3434071620782446,
"grad_norm": 1.9956347191580364,
"learning_rate": 3.3852721451440774e-06,
"loss": 0.65959587,
"memory(GiB)": 71.94,
"step": 3245,
"train_speed(iter/s)": 0.020039
},
{
"epoch": 1.3454771268888428,
"grad_norm": 1.7321062675450865,
"learning_rate": 3.3745997865528285e-06,
"loss": 0.66381845,
"memory(GiB)": 71.94,
"step": 3250,
"train_speed(iter/s)": 0.02004
},
{
"epoch": 1.347547091699441,
"grad_norm": 1.9696403539230432,
"learning_rate": 3.3639274279615795e-06,
"loss": 0.64812822,
"memory(GiB)": 71.94,
"step": 3255,
"train_speed(iter/s)": 0.02004
},
{
"epoch": 1.3496170565100394,
"grad_norm": 2.1319401432279053,
"learning_rate": 3.353255069370331e-06,
"loss": 0.64005919,
"memory(GiB)": 71.94,
"step": 3260,
"train_speed(iter/s)": 0.020041
},
{
"epoch": 1.3516870213206376,
"grad_norm": 1.8363706763408283,
"learning_rate": 3.3425827107790824e-06,
"loss": 0.64201698,
"memory(GiB)": 71.94,
"step": 3265,
"train_speed(iter/s)": 0.020042
},
{
"epoch": 1.3537569861312357,
"grad_norm": 2.124129454637882,
"learning_rate": 3.331910352187834e-06,
"loss": 0.65025196,
"memory(GiB)": 71.94,
"step": 3270,
"train_speed(iter/s)": 0.020043
},
{
"epoch": 1.355826950941834,
"grad_norm": 1.7896864503844883,
"learning_rate": 3.321237993596585e-06,
"loss": 0.63300438,
"memory(GiB)": 71.94,
"step": 3275,
"train_speed(iter/s)": 0.020044
},
{
"epoch": 1.3578969157524323,
"grad_norm": 1.9837261215389441,
"learning_rate": 3.310565635005337e-06,
"loss": 0.63380709,
"memory(GiB)": 71.94,
"step": 3280,
"train_speed(iter/s)": 0.020044
},
{
"epoch": 1.3599668805630305,
"grad_norm": 2.097355543637799,
"learning_rate": 3.299893276414088e-06,
"loss": 0.62883258,
"memory(GiB)": 71.94,
"step": 3285,
"train_speed(iter/s)": 0.020044
},
{
"epoch": 1.3620368453736287,
"grad_norm": 1.9315070924851874,
"learning_rate": 3.289220917822839e-06,
"loss": 0.63195004,
"memory(GiB)": 71.94,
"step": 3290,
"train_speed(iter/s)": 0.020045
},
{
"epoch": 1.3641068101842269,
"grad_norm": 1.7050134771385488,
"learning_rate": 3.2785485592315903e-06,
"loss": 0.65333614,
"memory(GiB)": 71.94,
"step": 3295,
"train_speed(iter/s)": 0.020045
},
{
"epoch": 1.366176774994825,
"grad_norm": 3.6481119398862005,
"learning_rate": 3.267876200640342e-06,
"loss": 0.68793478,
"memory(GiB)": 71.94,
"step": 3300,
"train_speed(iter/s)": 0.020046
},
{
"epoch": 1.366176774994825,
"eval_loss": 0.8023556470870972,
"eval_runtime": 334.0894,
"eval_samples_per_second": 18.696,
"eval_steps_per_second": 1.17,
"step": 3300
},
{
"epoch": 1.3682467398054232,
"grad_norm": 2.092330422962737,
"learning_rate": 3.2572038420490933e-06,
"loss": 0.63313837,
"memory(GiB)": 71.94,
"step": 3305,
"train_speed(iter/s)": 0.02
},
{
"epoch": 1.3703167046160214,
"grad_norm": 1.9167879677265438,
"learning_rate": 3.2465314834578443e-06,
"loss": 0.65159397,
"memory(GiB)": 71.94,
"step": 3310,
"train_speed(iter/s)": 0.020001
},
{
"epoch": 1.3723866694266198,
"grad_norm": 1.8761921604953933,
"learning_rate": 3.235859124866596e-06,
"loss": 0.62655239,
"memory(GiB)": 71.94,
"step": 3315,
"train_speed(iter/s)": 0.020001
},
{
"epoch": 1.374456634237218,
"grad_norm": 2.178411039961288,
"learning_rate": 3.2251867662753472e-06,
"loss": 0.67218485,
"memory(GiB)": 71.94,
"step": 3320,
"train_speed(iter/s)": 0.020002
},
{
"epoch": 1.3765265990478162,
"grad_norm": 1.776609105623676,
"learning_rate": 3.2145144076840983e-06,
"loss": 0.63943739,
"memory(GiB)": 71.94,
"step": 3325,
"train_speed(iter/s)": 0.020003
},
{
"epoch": 1.3785965638584143,
"grad_norm": 1.8854963168848657,
"learning_rate": 3.2038420490928497e-06,
"loss": 0.65016818,
"memory(GiB)": 71.94,
"step": 3330,
"train_speed(iter/s)": 0.020003
},
{
"epoch": 1.3806665286690127,
"grad_norm": 2.3494051096524915,
"learning_rate": 3.193169690501601e-06,
"loss": 0.6696517,
"memory(GiB)": 71.94,
"step": 3335,
"train_speed(iter/s)": 0.020004
},
{
"epoch": 1.382736493479611,
"grad_norm": 2.390390518794705,
"learning_rate": 3.1824973319103527e-06,
"loss": 0.64389553,
"memory(GiB)": 71.94,
"step": 3340,
"train_speed(iter/s)": 0.020005
},
{
"epoch": 1.384806458290209,
"grad_norm": 1.8195738414381504,
"learning_rate": 3.1718249733191037e-06,
"loss": 0.64606419,
"memory(GiB)": 71.94,
"step": 3345,
"train_speed(iter/s)": 0.020006
},
{
"epoch": 1.3868764231008073,
"grad_norm": 2.3772901728441846,
"learning_rate": 3.1611526147278547e-06,
"loss": 0.66564407,
"memory(GiB)": 71.94,
"step": 3350,
"train_speed(iter/s)": 0.020006
},
{
"epoch": 1.3889463879114055,
"grad_norm": 1.9865374550113573,
"learning_rate": 3.1504802561366066e-06,
"loss": 0.62708969,
"memory(GiB)": 71.94,
"step": 3355,
"train_speed(iter/s)": 0.020007
},
{
"epoch": 1.3910163527220036,
"grad_norm": 1.8995100511755476,
"learning_rate": 3.1398078975453577e-06,
"loss": 0.64633865,
"memory(GiB)": 71.94,
"step": 3360,
"train_speed(iter/s)": 0.020008
},
{
"epoch": 1.3930863175326018,
"grad_norm": 2.3050512969189954,
"learning_rate": 3.129135538954109e-06,
"loss": 0.64496307,
"memory(GiB)": 71.94,
"step": 3365,
"train_speed(iter/s)": 0.020008
},
{
"epoch": 1.3951562823432002,
"grad_norm": 1.9056478303612696,
"learning_rate": 3.1184631803628606e-06,
"loss": 0.641465,
"memory(GiB)": 71.94,
"step": 3370,
"train_speed(iter/s)": 0.020009
},
{
"epoch": 1.3972262471537984,
"grad_norm": 2.166573531526341,
"learning_rate": 3.107790821771612e-06,
"loss": 0.64069824,
"memory(GiB)": 71.94,
"step": 3375,
"train_speed(iter/s)": 0.020009
},
{
"epoch": 1.3992962119643966,
"grad_norm": 1.9398811847394521,
"learning_rate": 3.097118463180363e-06,
"loss": 0.64301763,
"memory(GiB)": 47.6,
"step": 3380,
"train_speed(iter/s)": 0.02001
},
{
"epoch": 1.4013661767749948,
"grad_norm": 1.959159982373063,
"learning_rate": 3.086446104589114e-06,
"loss": 0.6362546,
"memory(GiB)": 47.6,
"step": 3385,
"train_speed(iter/s)": 0.020011
},
{
"epoch": 1.4034361415855932,
"grad_norm": 2.0530926805349083,
"learning_rate": 3.075773745997866e-06,
"loss": 0.63418798,
"memory(GiB)": 47.6,
"step": 3390,
"train_speed(iter/s)": 0.020013
},
{
"epoch": 1.4055061063961913,
"grad_norm": 1.9297510699914728,
"learning_rate": 3.065101387406617e-06,
"loss": 0.6311512,
"memory(GiB)": 47.6,
"step": 3395,
"train_speed(iter/s)": 0.020013
},
{
"epoch": 1.4075760712067895,
"grad_norm": 1.9271894163285872,
"learning_rate": 3.054429028815368e-06,
"loss": 0.63234644,
"memory(GiB)": 47.6,
"step": 3400,
"train_speed(iter/s)": 0.020014
},
{
"epoch": 1.4096460360173877,
"grad_norm": 1.7395603251397769,
"learning_rate": 3.04375667022412e-06,
"loss": 0.63525658,
"memory(GiB)": 47.6,
"step": 3405,
"train_speed(iter/s)": 0.020015
},
{
"epoch": 1.4117160008279859,
"grad_norm": 1.9592997788913435,
"learning_rate": 3.033084311632871e-06,
"loss": 0.62426691,
"memory(GiB)": 47.6,
"step": 3410,
"train_speed(iter/s)": 0.020016
},
{
"epoch": 1.413785965638584,
"grad_norm": 1.83244625095987,
"learning_rate": 3.0224119530416225e-06,
"loss": 0.64436603,
"memory(GiB)": 47.6,
"step": 3415,
"train_speed(iter/s)": 0.020016
},
{
"epoch": 1.4158559304491822,
"grad_norm": 1.9274094419667949,
"learning_rate": 3.0117395944503735e-06,
"loss": 0.65462785,
"memory(GiB)": 47.6,
"step": 3420,
"train_speed(iter/s)": 0.020017
},
{
"epoch": 1.4179258952597806,
"grad_norm": 1.9910550366499922,
"learning_rate": 3.0010672358591254e-06,
"loss": 0.62345576,
"memory(GiB)": 47.6,
"step": 3425,
"train_speed(iter/s)": 0.020017
},
{
"epoch": 1.4199958600703788,
"grad_norm": 1.9961790791051468,
"learning_rate": 2.9903948772678764e-06,
"loss": 0.62686663,
"memory(GiB)": 48.58,
"step": 3430,
"train_speed(iter/s)": 0.020018
},
{
"epoch": 1.422065824880977,
"grad_norm": 2.2471760043812767,
"learning_rate": 2.9797225186766275e-06,
"loss": 0.65531764,
"memory(GiB)": 48.58,
"step": 3435,
"train_speed(iter/s)": 0.020018
},
{
"epoch": 1.4241357896915752,
"grad_norm": 2.225238329922807,
"learning_rate": 2.9690501600853794e-06,
"loss": 0.64034252,
"memory(GiB)": 48.58,
"step": 3440,
"train_speed(iter/s)": 0.020019
},
{
"epoch": 1.4262057545021736,
"grad_norm": 1.6663799088887756,
"learning_rate": 2.9583778014941304e-06,
"loss": 0.61355238,
"memory(GiB)": 48.58,
"step": 3445,
"train_speed(iter/s)": 0.020019
},
{
"epoch": 1.4282757193127718,
"grad_norm": 1.9127892416039678,
"learning_rate": 2.947705442902882e-06,
"loss": 0.61548796,
"memory(GiB)": 48.58,
"step": 3450,
"train_speed(iter/s)": 0.02002
},
{
"epoch": 1.43034568412337,
"grad_norm": 1.7733393610398442,
"learning_rate": 2.937033084311633e-06,
"loss": 0.66395988,
"memory(GiB)": 48.58,
"step": 3455,
"train_speed(iter/s)": 0.02002
},
{
"epoch": 1.432415648933968,
"grad_norm": 2.1962868424551676,
"learning_rate": 2.9263607257203848e-06,
"loss": 0.61601906,
"memory(GiB)": 48.58,
"step": 3460,
"train_speed(iter/s)": 0.020021
},
{
"epoch": 1.4344856137445663,
"grad_norm": 1.8134260024404016,
"learning_rate": 2.915688367129136e-06,
"loss": 0.64126596,
"memory(GiB)": 48.58,
"step": 3465,
"train_speed(iter/s)": 0.020022
},
{
"epoch": 1.4365555785551645,
"grad_norm": 2.136457991499728,
"learning_rate": 2.905016008537887e-06,
"loss": 0.63421082,
"memory(GiB)": 48.58,
"step": 3470,
"train_speed(iter/s)": 0.020022
},
{
"epoch": 1.4386255433657626,
"grad_norm": 1.988532670614334,
"learning_rate": 2.8943436499466383e-06,
"loss": 0.63867459,
"memory(GiB)": 48.58,
"step": 3475,
"train_speed(iter/s)": 0.020023
},
{
"epoch": 1.440695508176361,
"grad_norm": 1.9277626836086974,
"learning_rate": 2.8836712913553898e-06,
"loss": 0.65572329,
"memory(GiB)": 48.58,
"step": 3480,
"train_speed(iter/s)": 0.020023
},
{
"epoch": 1.4427654729869592,
"grad_norm": 2.1432613819151247,
"learning_rate": 2.8729989327641412e-06,
"loss": 0.64992175,
"memory(GiB)": 48.58,
"step": 3485,
"train_speed(iter/s)": 0.020025
},
{
"epoch": 1.4448354377975574,
"grad_norm": 2.355567308350272,
"learning_rate": 2.8623265741728923e-06,
"loss": 0.66429825,
"memory(GiB)": 48.58,
"step": 3490,
"train_speed(iter/s)": 0.020026
},
{
"epoch": 1.4469054026081556,
"grad_norm": 2.045569698254085,
"learning_rate": 2.851654215581644e-06,
"loss": 0.65050926,
"memory(GiB)": 48.58,
"step": 3495,
"train_speed(iter/s)": 0.020026
},
{
"epoch": 1.448975367418754,
"grad_norm": 1.8389546498339602,
"learning_rate": 2.840981856990395e-06,
"loss": 0.63454714,
"memory(GiB)": 48.58,
"step": 3500,
"train_speed(iter/s)": 0.020027
},
{
"epoch": 1.4510453322293522,
"grad_norm": 2.1282462505999518,
"learning_rate": 2.8303094983991462e-06,
"loss": 0.61026077,
"memory(GiB)": 48.58,
"step": 3505,
"train_speed(iter/s)": 0.020027
},
{
"epoch": 1.4531152970399503,
"grad_norm": 1.9822876450283116,
"learning_rate": 2.8196371398078977e-06,
"loss": 0.63836317,
"memory(GiB)": 48.58,
"step": 3510,
"train_speed(iter/s)": 0.020028
},
{
"epoch": 1.4551852618505485,
"grad_norm": 2.1201754460326603,
"learning_rate": 2.808964781216649e-06,
"loss": 0.65191045,
"memory(GiB)": 48.58,
"step": 3515,
"train_speed(iter/s)": 0.020028
},
{
"epoch": 1.4572552266611467,
"grad_norm": 1.8520751525916404,
"learning_rate": 2.7982924226254006e-06,
"loss": 0.62120395,
"memory(GiB)": 48.58,
"step": 3520,
"train_speed(iter/s)": 0.02003
},
{
"epoch": 1.4593251914717449,
"grad_norm": 1.7983375554778653,
"learning_rate": 2.7876200640341517e-06,
"loss": 0.63227787,
"memory(GiB)": 48.58,
"step": 3525,
"train_speed(iter/s)": 0.02003
},
{
"epoch": 1.461395156282343,
"grad_norm": 2.340481282081477,
"learning_rate": 2.7769477054429036e-06,
"loss": 0.63372889,
"memory(GiB)": 48.58,
"step": 3530,
"train_speed(iter/s)": 0.020031
},
{
"epoch": 1.4634651210929415,
"grad_norm": 1.9262925464363927,
"learning_rate": 2.7662753468516546e-06,
"loss": 0.63678517,
"memory(GiB)": 48.58,
"step": 3535,
"train_speed(iter/s)": 0.020031
},
{
"epoch": 1.4655350859035396,
"grad_norm": 2.2093247947290164,
"learning_rate": 2.7556029882604056e-06,
"loss": 0.6558732,
"memory(GiB)": 48.58,
"step": 3540,
"train_speed(iter/s)": 0.020032
},
{
"epoch": 1.4676050507141378,
"grad_norm": 2.020333271689009,
"learning_rate": 2.744930629669157e-06,
"loss": 0.60733166,
"memory(GiB)": 48.58,
"step": 3545,
"train_speed(iter/s)": 0.020032
},
{
"epoch": 1.469675015524736,
"grad_norm": 2.0734276125917632,
"learning_rate": 2.7342582710779086e-06,
"loss": 0.66044526,
"memory(GiB)": 48.58,
"step": 3550,
"train_speed(iter/s)": 0.020033
},
{
"epoch": 1.4717449803353344,
"grad_norm": 1.8332312440056042,
"learning_rate": 2.72358591248666e-06,
"loss": 0.64025326,
"memory(GiB)": 48.58,
"step": 3555,
"train_speed(iter/s)": 0.020034
},
{
"epoch": 1.4738149451459326,
"grad_norm": 1.8856532947296074,
"learning_rate": 2.712913553895411e-06,
"loss": 0.63681345,
"memory(GiB)": 48.58,
"step": 3560,
"train_speed(iter/s)": 0.020034
},
{
"epoch": 1.4758849099565308,
"grad_norm": 1.6710776220883121,
"learning_rate": 2.702241195304162e-06,
"loss": 0.64032116,
"memory(GiB)": 48.58,
"step": 3565,
"train_speed(iter/s)": 0.020035
},
{
"epoch": 1.477954874767129,
"grad_norm": 2.095636589348373,
"learning_rate": 2.691568836712914e-06,
"loss": 0.63223238,
"memory(GiB)": 48.58,
"step": 3570,
"train_speed(iter/s)": 0.020035
},
{
"epoch": 1.4800248395777271,
"grad_norm": 1.8927929015268774,
"learning_rate": 2.680896478121665e-06,
"loss": 0.63061609,
"memory(GiB)": 48.58,
"step": 3575,
"train_speed(iter/s)": 0.020036
},
{
"epoch": 1.4820948043883253,
"grad_norm": 2.0929754486788137,
"learning_rate": 2.670224119530416e-06,
"loss": 0.62695656,
"memory(GiB)": 48.58,
"step": 3580,
"train_speed(iter/s)": 0.020037
},
{
"epoch": 1.4841647691989235,
"grad_norm": 1.7458918576306506,
"learning_rate": 2.659551760939168e-06,
"loss": 0.63322277,
"memory(GiB)": 48.58,
"step": 3585,
"train_speed(iter/s)": 0.020037
},
{
"epoch": 1.4862347340095219,
"grad_norm": 1.8907196532347343,
"learning_rate": 2.648879402347919e-06,
"loss": 0.63732347,
"memory(GiB)": 48.58,
"step": 3590,
"train_speed(iter/s)": 0.020038
},
{
"epoch": 1.48830469882012,
"grad_norm": 2.263161529685459,
"learning_rate": 2.6382070437566704e-06,
"loss": 0.6138607,
"memory(GiB)": 48.58,
"step": 3595,
"train_speed(iter/s)": 0.020039
},
{
"epoch": 1.4903746636307182,
"grad_norm": 2.572454412416673,
"learning_rate": 2.6275346851654215e-06,
"loss": 0.63049603,
"memory(GiB)": 48.58,
"step": 3600,
"train_speed(iter/s)": 0.02004
},
{
"epoch": 1.4903746636307182,
"eval_loss": 0.8007386922836304,
"eval_runtime": 333.8857,
"eval_samples_per_second": 18.707,
"eval_steps_per_second": 1.171,
"step": 3600
},
{
"epoch": 1.4924446284413164,
"grad_norm": 1.901029287627636,
"learning_rate": 2.6168623265741734e-06,
"loss": 0.65960011,
"memory(GiB)": 48.58,
"step": 3605,
"train_speed(iter/s)": 0.019998
},
{
"epoch": 1.4945145932519148,
"grad_norm": 2.0108141055428423,
"learning_rate": 2.6061899679829244e-06,
"loss": 0.63098369,
"memory(GiB)": 48.58,
"step": 3610,
"train_speed(iter/s)": 0.019999
},
{
"epoch": 1.496584558062513,
"grad_norm": 1.8385014047339334,
"learning_rate": 2.5955176093916754e-06,
"loss": 0.63654456,
"memory(GiB)": 48.58,
"step": 3615,
"train_speed(iter/s)": 0.019999
},
{
"epoch": 1.4986545228731112,
"grad_norm": 2.502391600466987,
"learning_rate": 2.5848452508004273e-06,
"loss": 0.64129109,
"memory(GiB)": 48.58,
"step": 3620,
"train_speed(iter/s)": 0.02
},
{
"epoch": 1.5007244876837094,
"grad_norm": 1.8078536683803752,
"learning_rate": 2.5741728922091784e-06,
"loss": 0.61762905,
"memory(GiB)": 48.58,
"step": 3625,
"train_speed(iter/s)": 0.020001
},
{
"epoch": 1.5027944524943075,
"grad_norm": 2.4124021719958813,
"learning_rate": 2.56350053361793e-06,
"loss": 0.62886133,
"memory(GiB)": 48.58,
"step": 3630,
"train_speed(iter/s)": 0.020002
},
{
"epoch": 1.5048644173049057,
"grad_norm": 2.14818078370575,
"learning_rate": 2.552828175026681e-06,
"loss": 0.64229774,
"memory(GiB)": 48.58,
"step": 3635,
"train_speed(iter/s)": 0.020002
},
{
"epoch": 1.506934382115504,
"grad_norm": 2.0607592121598777,
"learning_rate": 2.5421558164354328e-06,
"loss": 0.62922821,
"memory(GiB)": 48.58,
"step": 3640,
"train_speed(iter/s)": 0.020003
},
{
"epoch": 1.5090043469261023,
"grad_norm": 1.9537072105756903,
"learning_rate": 2.531483457844184e-06,
"loss": 0.6114254,
"memory(GiB)": 48.58,
"step": 3645,
"train_speed(iter/s)": 0.020004
},
{
"epoch": 1.5110743117367005,
"grad_norm": 2.076894133768813,
"learning_rate": 2.520811099252935e-06,
"loss": 0.65702705,
"memory(GiB)": 48.58,
"step": 3650,
"train_speed(iter/s)": 0.020005
},
{
"epoch": 1.5131442765472987,
"grad_norm": 1.8684495033846724,
"learning_rate": 2.5101387406616863e-06,
"loss": 0.65235605,
"memory(GiB)": 48.58,
"step": 3655,
"train_speed(iter/s)": 0.020005
},
{
"epoch": 1.515214241357897,
"grad_norm": 2.099820461610625,
"learning_rate": 2.4994663820704378e-06,
"loss": 0.63383131,
"memory(GiB)": 48.58,
"step": 3660,
"train_speed(iter/s)": 0.020005
},
{
"epoch": 1.5172842061684952,
"grad_norm": 2.0251264820061765,
"learning_rate": 2.4887940234791892e-06,
"loss": 0.63897676,
"memory(GiB)": 48.58,
"step": 3665,
"train_speed(iter/s)": 0.020006
},
{
"epoch": 1.5193541709790934,
"grad_norm": 1.8916351044126378,
"learning_rate": 2.4781216648879407e-06,
"loss": 0.63116732,
"memory(GiB)": 48.58,
"step": 3670,
"train_speed(iter/s)": 0.020007
},
{
"epoch": 1.5214241357896916,
"grad_norm": 1.758567214100863,
"learning_rate": 2.4674493062966917e-06,
"loss": 0.61664515,
"memory(GiB)": 48.58,
"step": 3675,
"train_speed(iter/s)": 0.020008
},
{
"epoch": 1.5234941006002898,
"grad_norm": 2.004985649591871,
"learning_rate": 2.456776947705443e-06,
"loss": 0.63878107,
"memory(GiB)": 48.58,
"step": 3680,
"train_speed(iter/s)": 0.020008
},
{
"epoch": 1.525564065410888,
"grad_norm": 2.0496876810492894,
"learning_rate": 2.4461045891141942e-06,
"loss": 0.62643003,
"memory(GiB)": 48.58,
"step": 3685,
"train_speed(iter/s)": 0.020009
},
{
"epoch": 1.5276340302214861,
"grad_norm": 1.9829235096697742,
"learning_rate": 2.4354322305229457e-06,
"loss": 0.62055197,
"memory(GiB)": 48.58,
"step": 3690,
"train_speed(iter/s)": 0.02001
},
{
"epoch": 1.5297039950320843,
"grad_norm": 1.7991062631513268,
"learning_rate": 2.424759871931697e-06,
"loss": 0.6605298,
"memory(GiB)": 48.58,
"step": 3695,
"train_speed(iter/s)": 0.02001
},
{
"epoch": 1.5317739598426827,
"grad_norm": 2.526220221053364,
"learning_rate": 2.4140875133404486e-06,
"loss": 0.65973449,
"memory(GiB)": 48.58,
"step": 3700,
"train_speed(iter/s)": 0.020011
},
{
"epoch": 1.533843924653281,
"grad_norm": 1.682540853202414,
"learning_rate": 2.4034151547492e-06,
"loss": 0.65421052,
"memory(GiB)": 48.58,
"step": 3705,
"train_speed(iter/s)": 0.020012
},
{
"epoch": 1.535913889463879,
"grad_norm": 1.965734649630595,
"learning_rate": 2.392742796157951e-06,
"loss": 0.64042482,
"memory(GiB)": 48.58,
"step": 3710,
"train_speed(iter/s)": 0.020013
},
{
"epoch": 1.5379838542744775,
"grad_norm": 2.2061994907060005,
"learning_rate": 2.3820704375667026e-06,
"loss": 0.61186495,
"memory(GiB)": 48.58,
"step": 3715,
"train_speed(iter/s)": 0.020014
},
{
"epoch": 1.5400538190850757,
"grad_norm": 2.2749623686643474,
"learning_rate": 2.3713980789754536e-06,
"loss": 0.63306904,
"memory(GiB)": 48.58,
"step": 3720,
"train_speed(iter/s)": 0.020014
},
{
"epoch": 1.5421237838956738,
"grad_norm": 1.8676661262594305,
"learning_rate": 2.360725720384205e-06,
"loss": 0.64988294,
"memory(GiB)": 48.58,
"step": 3725,
"train_speed(iter/s)": 0.020015
},
{
"epoch": 1.544193748706272,
"grad_norm": 2.069377222520985,
"learning_rate": 2.3500533617929565e-06,
"loss": 0.63958693,
"memory(GiB)": 48.58,
"step": 3730,
"train_speed(iter/s)": 0.020016
},
{
"epoch": 1.5462637135168702,
"grad_norm": 2.0148406212108574,
"learning_rate": 2.339381003201708e-06,
"loss": 0.60387516,
"memory(GiB)": 48.58,
"step": 3735,
"train_speed(iter/s)": 0.020016
},
{
"epoch": 1.5483336783274684,
"grad_norm": 1.8592072092273917,
"learning_rate": 2.328708644610459e-06,
"loss": 0.60533466,
"memory(GiB)": 48.58,
"step": 3740,
"train_speed(iter/s)": 0.020017
},
{
"epoch": 1.5504036431380666,
"grad_norm": 1.9293967097992641,
"learning_rate": 2.3180362860192105e-06,
"loss": 0.64885559,
"memory(GiB)": 48.58,
"step": 3745,
"train_speed(iter/s)": 0.020018
},
{
"epoch": 1.5524736079486647,
"grad_norm": 2.105150298236883,
"learning_rate": 2.307363927427962e-06,
"loss": 0.6291214,
"memory(GiB)": 48.58,
"step": 3750,
"train_speed(iter/s)": 0.020019
},
{
"epoch": 1.5545435727592631,
"grad_norm": 1.7706279189258218,
"learning_rate": 2.296691568836713e-06,
"loss": 0.62637095,
"memory(GiB)": 48.58,
"step": 3755,
"train_speed(iter/s)": 0.020019
},
{
"epoch": 1.5566135375698613,
"grad_norm": 2.565565704557741,
"learning_rate": 2.2860192102454645e-06,
"loss": 0.61526871,
"memory(GiB)": 48.58,
"step": 3760,
"train_speed(iter/s)": 0.02002
},
{
"epoch": 1.5586835023804595,
"grad_norm": 2.07386908713523,
"learning_rate": 2.2753468516542155e-06,
"loss": 0.62105417,
"memory(GiB)": 48.58,
"step": 3765,
"train_speed(iter/s)": 0.02002
},
{
"epoch": 1.560753467191058,
"grad_norm": 2.0656516897338064,
"learning_rate": 2.264674493062967e-06,
"loss": 0.62474051,
"memory(GiB)": 48.58,
"step": 3770,
"train_speed(iter/s)": 0.020021
},
{
"epoch": 1.562823432001656,
"grad_norm": 2.1881677769227075,
"learning_rate": 2.2540021344717184e-06,
"loss": 0.6300148,
"memory(GiB)": 48.58,
"step": 3775,
"train_speed(iter/s)": 0.020022
},
{
"epoch": 1.5648933968122543,
"grad_norm": 1.9586494376217993,
"learning_rate": 2.24332977588047e-06,
"loss": 0.59469123,
"memory(GiB)": 48.58,
"step": 3780,
"train_speed(iter/s)": 0.020023
},
{
"epoch": 1.5669633616228524,
"grad_norm": 2.440599800961732,
"learning_rate": 2.232657417289221e-06,
"loss": 0.62468157,
"memory(GiB)": 48.58,
"step": 3785,
"train_speed(iter/s)": 0.020024
},
{
"epoch": 1.5690333264334506,
"grad_norm": 2.0016886010949797,
"learning_rate": 2.2219850586979724e-06,
"loss": 0.62775316,
"memory(GiB)": 48.58,
"step": 3790,
"train_speed(iter/s)": 0.020025
},
{
"epoch": 1.5711032912440488,
"grad_norm": 1.9086880143876215,
"learning_rate": 2.211312700106724e-06,
"loss": 0.62862492,
"memory(GiB)": 48.58,
"step": 3795,
"train_speed(iter/s)": 0.020026
},
{
"epoch": 1.573173256054647,
"grad_norm": 2.0888610038198636,
"learning_rate": 2.200640341515475e-06,
"loss": 0.59845972,
"memory(GiB)": 48.58,
"step": 3800,
"train_speed(iter/s)": 0.020026
},
{
"epoch": 1.5752432208652452,
"grad_norm": 2.055969507055363,
"learning_rate": 2.1899679829242263e-06,
"loss": 0.61734905,
"memory(GiB)": 48.58,
"step": 3805,
"train_speed(iter/s)": 0.020027
},
{
"epoch": 1.5773131856758436,
"grad_norm": 1.9707520677944765,
"learning_rate": 2.179295624332978e-06,
"loss": 0.62550197,
"memory(GiB)": 48.58,
"step": 3810,
"train_speed(iter/s)": 0.020028
},
{
"epoch": 1.5793831504864417,
"grad_norm": 2.1321630269454617,
"learning_rate": 2.1686232657417293e-06,
"loss": 0.62156429,
"memory(GiB)": 48.58,
"step": 3815,
"train_speed(iter/s)": 0.020029
},
{
"epoch": 1.58145311529704,
"grad_norm": 2.3313380056807373,
"learning_rate": 2.1579509071504803e-06,
"loss": 0.62489176,
"memory(GiB)": 48.58,
"step": 3820,
"train_speed(iter/s)": 0.02003
},
{
"epoch": 1.5835230801076383,
"grad_norm": 2.0579013534602044,
"learning_rate": 2.1472785485592318e-06,
"loss": 0.60487609,
"memory(GiB)": 48.58,
"step": 3825,
"train_speed(iter/s)": 0.02003
},
{
"epoch": 1.5855930449182365,
"grad_norm": 2.1449642041698267,
"learning_rate": 2.136606189967983e-06,
"loss": 0.63105164,
"memory(GiB)": 48.58,
"step": 3830,
"train_speed(iter/s)": 0.020031
},
{
"epoch": 1.5876630097288347,
"grad_norm": 1.8838252023426636,
"learning_rate": 2.1259338313767343e-06,
"loss": 0.62664189,
"memory(GiB)": 48.58,
"step": 3835,
"train_speed(iter/s)": 0.020032
},
{
"epoch": 1.5897329745394329,
"grad_norm": 2.289021375023386,
"learning_rate": 2.1152614727854857e-06,
"loss": 0.60014114,
"memory(GiB)": 48.58,
"step": 3840,
"train_speed(iter/s)": 0.020032
},
{
"epoch": 1.591802939350031,
"grad_norm": 1.952623771952289,
"learning_rate": 2.104589114194237e-06,
"loss": 0.63038387,
"memory(GiB)": 48.58,
"step": 3845,
"train_speed(iter/s)": 0.020032
},
{
"epoch": 1.5938729041606292,
"grad_norm": 2.151088803547346,
"learning_rate": 2.0939167556029887e-06,
"loss": 0.64208837,
"memory(GiB)": 48.58,
"step": 3850,
"train_speed(iter/s)": 0.020033
},
{
"epoch": 1.5959428689712274,
"grad_norm": 2.0667970850832025,
"learning_rate": 2.0832443970117397e-06,
"loss": 0.62941227,
"memory(GiB)": 48.58,
"step": 3855,
"train_speed(iter/s)": 0.020034
},
{
"epoch": 1.5980128337818256,
"grad_norm": 1.8287162112237207,
"learning_rate": 2.072572038420491e-06,
"loss": 0.60888386,
"memory(GiB)": 48.58,
"step": 3860,
"train_speed(iter/s)": 0.020035
},
{
"epoch": 1.600082798592424,
"grad_norm": 2.083482144562111,
"learning_rate": 2.061899679829242e-06,
"loss": 0.60378475,
"memory(GiB)": 48.58,
"step": 3865,
"train_speed(iter/s)": 0.020035
},
{
"epoch": 1.6021527634030222,
"grad_norm": 2.3064863216709925,
"learning_rate": 2.0512273212379937e-06,
"loss": 0.60522232,
"memory(GiB)": 48.58,
"step": 3870,
"train_speed(iter/s)": 0.020036
},
{
"epoch": 1.6042227282136203,
"grad_norm": 2.1700368490113844,
"learning_rate": 2.040554962646745e-06,
"loss": 0.61363611,
"memory(GiB)": 48.58,
"step": 3875,
"train_speed(iter/s)": 0.020037
},
{
"epoch": 1.6062926930242187,
"grad_norm": 2.2038759685376843,
"learning_rate": 2.0298826040554966e-06,
"loss": 0.60480423,
"memory(GiB)": 48.58,
"step": 3880,
"train_speed(iter/s)": 0.020037
},
{
"epoch": 1.608362657834817,
"grad_norm": 2.1947427482623914,
"learning_rate": 2.019210245464248e-06,
"loss": 0.61413918,
"memory(GiB)": 48.58,
"step": 3885,
"train_speed(iter/s)": 0.020038
},
{
"epoch": 1.610432622645415,
"grad_norm": 2.022770825774821,
"learning_rate": 2.008537886872999e-06,
"loss": 0.61320429,
"memory(GiB)": 48.58,
"step": 3890,
"train_speed(iter/s)": 0.020039
},
{
"epoch": 1.6125025874560133,
"grad_norm": 2.052392899511488,
"learning_rate": 1.9978655282817505e-06,
"loss": 0.6130487,
"memory(GiB)": 48.58,
"step": 3895,
"train_speed(iter/s)": 0.020039
},
{
"epoch": 1.6145725522666114,
"grad_norm": 2.0156586912928596,
"learning_rate": 1.9871931696905016e-06,
"loss": 0.64238014,
"memory(GiB)": 48.58,
"step": 3900,
"train_speed(iter/s)": 0.02004
},
{
"epoch": 1.6145725522666114,
"eval_loss": 0.79938143491745,
"eval_runtime": 333.9342,
"eval_samples_per_second": 18.704,
"eval_steps_per_second": 1.171,
"step": 3900
},
{
"epoch": 1.6166425170772096,
"grad_norm": 2.023426003459966,
"learning_rate": 1.976520811099253e-06,
"loss": 0.61180491,
"memory(GiB)": 48.58,
"step": 3905,
"train_speed(iter/s)": 0.020001
},
{
"epoch": 1.6187124818878078,
"grad_norm": 2.053001775702235,
"learning_rate": 1.9658484525080045e-06,
"loss": 0.62299452,
"memory(GiB)": 48.58,
"step": 3910,
"train_speed(iter/s)": 0.020002
},
{
"epoch": 1.620782446698406,
"grad_norm": 1.867769590995953,
"learning_rate": 1.955176093916756e-06,
"loss": 0.61298056,
"memory(GiB)": 48.58,
"step": 3915,
"train_speed(iter/s)": 0.020002
},
{
"epoch": 1.6228524115090044,
"grad_norm": 2.3195386012179404,
"learning_rate": 1.944503735325507e-06,
"loss": 0.62579803,
"memory(GiB)": 48.58,
"step": 3920,
"train_speed(iter/s)": 0.020003
},
{
"epoch": 1.6249223763196026,
"grad_norm": 2.0728629845134825,
"learning_rate": 1.9338313767342585e-06,
"loss": 0.60280285,
"memory(GiB)": 48.58,
"step": 3925,
"train_speed(iter/s)": 0.020004
},
{
"epoch": 1.6269923411302007,
"grad_norm": 1.9603574858854882,
"learning_rate": 1.92315901814301e-06,
"loss": 0.61659031,
"memory(GiB)": 48.58,
"step": 3930,
"train_speed(iter/s)": 0.020005
},
{
"epoch": 1.6290623059407991,
"grad_norm": 2.1782071337826014,
"learning_rate": 1.912486659551761e-06,
"loss": 0.60701981,
"memory(GiB)": 48.58,
"step": 3935,
"train_speed(iter/s)": 0.020006
},
{
"epoch": 1.6311322707513973,
"grad_norm": 1.7195215240069541,
"learning_rate": 1.9018143009605124e-06,
"loss": 0.59961052,
"memory(GiB)": 48.58,
"step": 3940,
"train_speed(iter/s)": 0.020007
},
{
"epoch": 1.6332022355619955,
"grad_norm": 2.076700307294715,
"learning_rate": 1.8911419423692637e-06,
"loss": 0.61397924,
"memory(GiB)": 48.58,
"step": 3945,
"train_speed(iter/s)": 0.020008
},
{
"epoch": 1.6352722003725937,
"grad_norm": 2.1171856192990615,
"learning_rate": 1.8804695837780151e-06,
"loss": 0.61544151,
"memory(GiB)": 48.58,
"step": 3950,
"train_speed(iter/s)": 0.020008
},
{
"epoch": 1.6373421651831919,
"grad_norm": 2.2158147695337838,
"learning_rate": 1.8697972251867664e-06,
"loss": 0.62159052,
"memory(GiB)": 48.58,
"step": 3955,
"train_speed(iter/s)": 0.020009
},
{
"epoch": 1.63941212999379,
"grad_norm": 1.8635077630631116,
"learning_rate": 1.8591248665955179e-06,
"loss": 0.62834597,
"memory(GiB)": 48.58,
"step": 3960,
"train_speed(iter/s)": 0.02001
},
{
"epoch": 1.6414820948043882,
"grad_norm": 1.9610933756803843,
"learning_rate": 1.8484525080042693e-06,
"loss": 0.61779599,
"memory(GiB)": 48.58,
"step": 3965,
"train_speed(iter/s)": 0.020011
},
{
"epoch": 1.6435520596149864,
"grad_norm": 1.9029383953647434,
"learning_rate": 1.8377801494130204e-06,
"loss": 0.61077566,
"memory(GiB)": 48.58,
"step": 3970,
"train_speed(iter/s)": 0.020011
},
{
"epoch": 1.6456220244255848,
"grad_norm": 2.0846014405355926,
"learning_rate": 1.8271077908217718e-06,
"loss": 0.61311092,
"memory(GiB)": 48.58,
"step": 3975,
"train_speed(iter/s)": 0.020012
},
{
"epoch": 1.647691989236183,
"grad_norm": 2.390238831095919,
"learning_rate": 1.816435432230523e-06,
"loss": 0.62842617,
"memory(GiB)": 48.58,
"step": 3980,
"train_speed(iter/s)": 0.020013
},
{
"epoch": 1.6497619540467812,
"grad_norm": 1.8650578046674164,
"learning_rate": 1.8057630736392745e-06,
"loss": 0.62033787,
"memory(GiB)": 48.58,
"step": 3985,
"train_speed(iter/s)": 0.020014
},
{
"epoch": 1.6518319188573796,
"grad_norm": 1.9997553186688102,
"learning_rate": 1.7950907150480258e-06,
"loss": 0.61835356,
"memory(GiB)": 48.58,
"step": 3990,
"train_speed(iter/s)": 0.020015
},
{
"epoch": 1.6539018836679777,
"grad_norm": 2.0185275543308,
"learning_rate": 1.7844183564567772e-06,
"loss": 0.62564411,
"memory(GiB)": 48.58,
"step": 3995,
"train_speed(iter/s)": 0.020015
},
{
"epoch": 1.655971848478576,
"grad_norm": 1.6493866482983615,
"learning_rate": 1.7737459978655283e-06,
"loss": 0.62476611,
"memory(GiB)": 48.58,
"step": 4000,
"train_speed(iter/s)": 0.020016
},
{
"epoch": 1.658041813289174,
"grad_norm": 2.1258878217039157,
"learning_rate": 1.7630736392742797e-06,
"loss": 0.61094875,
"memory(GiB)": 48.58,
"step": 4005,
"train_speed(iter/s)": 0.020017
},
{
"epoch": 1.6601117780997723,
"grad_norm": 1.8264630776919117,
"learning_rate": 1.7524012806830312e-06,
"loss": 0.61455221,
"memory(GiB)": 48.58,
"step": 4010,
"train_speed(iter/s)": 0.020018
},
{
"epoch": 1.6621817429103705,
"grad_norm": 2.1834934664197148,
"learning_rate": 1.7417289220917825e-06,
"loss": 0.61525717,
"memory(GiB)": 48.58,
"step": 4015,
"train_speed(iter/s)": 0.020018
},
{
"epoch": 1.6642517077209686,
"grad_norm": 1.951531129714831,
"learning_rate": 1.731056563500534e-06,
"loss": 0.61558003,
"memory(GiB)": 48.58,
"step": 4020,
"train_speed(iter/s)": 0.020019
},
{
"epoch": 1.6663216725315668,
"grad_norm": 2.4480446682769315,
"learning_rate": 1.720384204909285e-06,
"loss": 0.58283405,
"memory(GiB)": 48.58,
"step": 4025,
"train_speed(iter/s)": 0.02002
},
{
"epoch": 1.6683916373421652,
"grad_norm": 1.7243360581014375,
"learning_rate": 1.7097118463180364e-06,
"loss": 0.60816731,
"memory(GiB)": 48.58,
"step": 4030,
"train_speed(iter/s)": 0.020021
},
{
"epoch": 1.6704616021527634,
"grad_norm": 1.935378370315751,
"learning_rate": 1.6990394877267877e-06,
"loss": 0.59363813,
"memory(GiB)": 48.58,
"step": 4035,
"train_speed(iter/s)": 0.020021
},
{
"epoch": 1.6725315669633616,
"grad_norm": 2.343341523870396,
"learning_rate": 1.6883671291355391e-06,
"loss": 0.59670277,
"memory(GiB)": 48.58,
"step": 4040,
"train_speed(iter/s)": 0.020022
},
{
"epoch": 1.67460153177396,
"grad_norm": 2.002429630691932,
"learning_rate": 1.6776947705442904e-06,
"loss": 0.61318674,
"memory(GiB)": 48.58,
"step": 4045,
"train_speed(iter/s)": 0.020022
},
{
"epoch": 1.6766714965845582,
"grad_norm": 2.161836907625474,
"learning_rate": 1.6670224119530418e-06,
"loss": 0.63845253,
"memory(GiB)": 48.58,
"step": 4050,
"train_speed(iter/s)": 0.020023
},
{
"epoch": 1.6787414613951563,
"grad_norm": 2.3095783835557993,
"learning_rate": 1.6563500533617933e-06,
"loss": 0.61825991,
"memory(GiB)": 48.58,
"step": 4055,
"train_speed(iter/s)": 0.020023
},
{
"epoch": 1.6808114262057545,
"grad_norm": 2.0857994277393326,
"learning_rate": 1.6456776947705443e-06,
"loss": 0.58271861,
"memory(GiB)": 48.58,
"step": 4060,
"train_speed(iter/s)": 0.020024
},
{
"epoch": 1.6828813910163527,
"grad_norm": 2.591242513945162,
"learning_rate": 1.6350053361792958e-06,
"loss": 0.59729037,
"memory(GiB)": 48.58,
"step": 4065,
"train_speed(iter/s)": 0.020025
},
{
"epoch": 1.6849513558269509,
"grad_norm": 2.2408993824641836,
"learning_rate": 1.624332977588047e-06,
"loss": 0.61958027,
"memory(GiB)": 48.58,
"step": 4070,
"train_speed(iter/s)": 0.020026
},
{
"epoch": 1.687021320637549,
"grad_norm": 2.07836511660639,
"learning_rate": 1.6136606189967985e-06,
"loss": 0.61059999,
"memory(GiB)": 48.58,
"step": 4075,
"train_speed(iter/s)": 0.020026
},
{
"epoch": 1.6890912854481472,
"grad_norm": 2.1885333872092767,
"learning_rate": 1.6029882604055498e-06,
"loss": 0.61813364,
"memory(GiB)": 48.58,
"step": 4080,
"train_speed(iter/s)": 0.020027
},
{
"epoch": 1.6911612502587456,
"grad_norm": 2.0563236890963075,
"learning_rate": 1.5923159018143012e-06,
"loss": 0.62983589,
"memory(GiB)": 48.58,
"step": 4085,
"train_speed(iter/s)": 0.020027
},
{
"epoch": 1.6932312150693438,
"grad_norm": 2.205936525088634,
"learning_rate": 1.5816435432230523e-06,
"loss": 0.6022356,
"memory(GiB)": 48.58,
"step": 4090,
"train_speed(iter/s)": 0.020028
},
{
"epoch": 1.695301179879942,
"grad_norm": 2.1192285983309262,
"learning_rate": 1.5709711846318037e-06,
"loss": 0.61405392,
"memory(GiB)": 48.58,
"step": 4095,
"train_speed(iter/s)": 0.020029
},
{
"epoch": 1.6973711446905404,
"grad_norm": 2.2747714483339676,
"learning_rate": 1.5602988260405552e-06,
"loss": 0.59890566,
"memory(GiB)": 48.58,
"step": 4100,
"train_speed(iter/s)": 0.02003
},
{
"epoch": 1.6994411095011386,
"grad_norm": 1.7865480667421139,
"learning_rate": 1.5496264674493064e-06,
"loss": 0.59438276,
"memory(GiB)": 48.58,
"step": 4105,
"train_speed(iter/s)": 0.02003
},
{
"epoch": 1.7015110743117368,
"grad_norm": 2.1136132702931953,
"learning_rate": 1.538954108858058e-06,
"loss": 0.58430662,
"memory(GiB)": 48.58,
"step": 4110,
"train_speed(iter/s)": 0.020031
},
{
"epoch": 1.703581039122335,
"grad_norm": 1.8888155653077559,
"learning_rate": 1.528281750266809e-06,
"loss": 0.61480141,
"memory(GiB)": 48.58,
"step": 4115,
"train_speed(iter/s)": 0.020032
},
{
"epoch": 1.7056510039329331,
"grad_norm": 2.1453583525948567,
"learning_rate": 1.5176093916755604e-06,
"loss": 0.60317545,
"memory(GiB)": 48.58,
"step": 4120,
"train_speed(iter/s)": 0.020033
},
{
"epoch": 1.7077209687435313,
"grad_norm": 2.0869705530610174,
"learning_rate": 1.5069370330843117e-06,
"loss": 0.57888694,
"memory(GiB)": 48.58,
"step": 4125,
"train_speed(iter/s)": 0.020033
},
{
"epoch": 1.7097909335541295,
"grad_norm": 2.0589261216950177,
"learning_rate": 1.4962646744930631e-06,
"loss": 0.63887987,
"memory(GiB)": 48.58,
"step": 4130,
"train_speed(iter/s)": 0.020034
},
{
"epoch": 1.7118608983647277,
"grad_norm": 2.246554976018598,
"learning_rate": 1.4855923159018144e-06,
"loss": 0.60080147,
"memory(GiB)": 48.58,
"step": 4135,
"train_speed(iter/s)": 0.020035
},
{
"epoch": 1.713930863175326,
"grad_norm": 2.2465140455191377,
"learning_rate": 1.4749199573105658e-06,
"loss": 0.62593145,
"memory(GiB)": 48.58,
"step": 4140,
"train_speed(iter/s)": 0.020035
},
{
"epoch": 1.7160008279859242,
"grad_norm": 2.1975792886927135,
"learning_rate": 1.4642475987193173e-06,
"loss": 0.61860814,
"memory(GiB)": 48.58,
"step": 4145,
"train_speed(iter/s)": 0.020036
},
{
"epoch": 1.7180707927965224,
"grad_norm": 1.8897839022485312,
"learning_rate": 1.4535752401280683e-06,
"loss": 0.62411423,
"memory(GiB)": 48.58,
"step": 4150,
"train_speed(iter/s)": 0.020037
},
{
"epoch": 1.7201407576071208,
"grad_norm": 2.4140521953157794,
"learning_rate": 1.4429028815368198e-06,
"loss": 0.60574412,
"memory(GiB)": 48.58,
"step": 4155,
"train_speed(iter/s)": 0.020038
},
{
"epoch": 1.722210722417719,
"grad_norm": 2.04528954072566,
"learning_rate": 1.432230522945571e-06,
"loss": 0.61065197,
"memory(GiB)": 48.58,
"step": 4160,
"train_speed(iter/s)": 0.020038
},
{
"epoch": 1.7242806872283172,
"grad_norm": 2.544468455409069,
"learning_rate": 1.4215581643543225e-06,
"loss": 0.59008269,
"memory(GiB)": 48.58,
"step": 4165,
"train_speed(iter/s)": 0.020039
},
{
"epoch": 1.7263506520389154,
"grad_norm": 1.9933726482621115,
"learning_rate": 1.4108858057630738e-06,
"loss": 0.61828232,
"memory(GiB)": 48.58,
"step": 4170,
"train_speed(iter/s)": 0.020039
},
{
"epoch": 1.7284206168495135,
"grad_norm": 2.0938798089462702,
"learning_rate": 1.4002134471718252e-06,
"loss": 0.63251686,
"memory(GiB)": 48.58,
"step": 4175,
"train_speed(iter/s)": 0.020039
},
{
"epoch": 1.7304905816601117,
"grad_norm": 2.586866443962492,
"learning_rate": 1.3895410885805763e-06,
"loss": 0.61843009,
"memory(GiB)": 48.58,
"step": 4180,
"train_speed(iter/s)": 0.02004
},
{
"epoch": 1.73256054647071,
"grad_norm": 2.157748885604068,
"learning_rate": 1.3788687299893277e-06,
"loss": 0.61311278,
"memory(GiB)": 48.58,
"step": 4185,
"train_speed(iter/s)": 0.020041
},
{
"epoch": 1.734630511281308,
"grad_norm": 2.1925495515847944,
"learning_rate": 1.3681963713980792e-06,
"loss": 0.61852412,
"memory(GiB)": 48.58,
"step": 4190,
"train_speed(iter/s)": 0.020041
},
{
"epoch": 1.7367004760919065,
"grad_norm": 2.0519214102963566,
"learning_rate": 1.3575240128068304e-06,
"loss": 0.60840869,
"memory(GiB)": 48.58,
"step": 4195,
"train_speed(iter/s)": 0.020041
},
{
"epoch": 1.7387704409025047,
"grad_norm": 2.1138169542373335,
"learning_rate": 1.3468516542155819e-06,
"loss": 0.60299668,
"memory(GiB)": 48.58,
"step": 4200,
"train_speed(iter/s)": 0.020042
},
{
"epoch": 1.7387704409025047,
"eval_loss": 0.7988836765289307,
"eval_runtime": 333.377,
"eval_samples_per_second": 18.736,
"eval_steps_per_second": 1.173,
"step": 4200
},
{
"epoch": 1.7408404057131028,
"grad_norm": 1.8176186505765208,
"learning_rate": 1.336179295624333e-06,
"loss": 0.59205551,
"memory(GiB)": 48.58,
"step": 4205,
"train_speed(iter/s)": 0.020007
},
{
"epoch": 1.7429103705237012,
"grad_norm": 2.097301273083197,
"learning_rate": 1.3255069370330844e-06,
"loss": 0.59544134,
"memory(GiB)": 48.58,
"step": 4210,
"train_speed(iter/s)": 0.020007
},
{
"epoch": 1.7449803353342994,
"grad_norm": 2.1754814306718093,
"learning_rate": 1.3148345784418356e-06,
"loss": 0.61374321,
"memory(GiB)": 48.58,
"step": 4215,
"train_speed(iter/s)": 0.020008
},
{
"epoch": 1.7470503001448976,
"grad_norm": 2.1091284971781543,
"learning_rate": 1.304162219850587e-06,
"loss": 0.58602142,
"memory(GiB)": 48.58,
"step": 4220,
"train_speed(iter/s)": 0.020008
},
{
"epoch": 1.7491202649554958,
"grad_norm": 2.1153060430505177,
"learning_rate": 1.2934898612593383e-06,
"loss": 0.59698052,
"memory(GiB)": 48.58,
"step": 4225,
"train_speed(iter/s)": 0.020009
},
{
"epoch": 1.751190229766094,
"grad_norm": 2.3476551963062193,
"learning_rate": 1.2828175026680898e-06,
"loss": 0.63100615,
"memory(GiB)": 48.58,
"step": 4230,
"train_speed(iter/s)": 0.020009
},
{
"epoch": 1.7532601945766921,
"grad_norm": 2.1723309903381987,
"learning_rate": 1.2721451440768413e-06,
"loss": 0.61266661,
"memory(GiB)": 48.58,
"step": 4235,
"train_speed(iter/s)": 0.02001
},
{
"epoch": 1.7553301593872903,
"grad_norm": 2.4855935431363267,
"learning_rate": 1.2614727854855923e-06,
"loss": 0.60998316,
"memory(GiB)": 48.58,
"step": 4240,
"train_speed(iter/s)": 0.020011
},
{
"epoch": 1.7574001241978885,
"grad_norm": 2.5103101197105233,
"learning_rate": 1.2508004268943438e-06,
"loss": 0.61193109,
"memory(GiB)": 48.58,
"step": 4245,
"train_speed(iter/s)": 0.020011
},
{
"epoch": 1.759470089008487,
"grad_norm": 2.2253279455991195,
"learning_rate": 1.2401280683030952e-06,
"loss": 0.58802786,
"memory(GiB)": 48.58,
"step": 4250,
"train_speed(iter/s)": 0.020012
},
{
"epoch": 1.761540053819085,
"grad_norm": 2.412011728154561,
"learning_rate": 1.2294557097118465e-06,
"loss": 0.6127542,
"memory(GiB)": 48.58,
"step": 4255,
"train_speed(iter/s)": 0.020013
},
{
"epoch": 1.7636100186296833,
"grad_norm": 2.186989181036045,
"learning_rate": 1.2187833511205977e-06,
"loss": 0.59003277,
"memory(GiB)": 48.58,
"step": 4260,
"train_speed(iter/s)": 0.020013
},
{
"epoch": 1.7656799834402817,
"grad_norm": 2.528633630931418,
"learning_rate": 1.2081109925293492e-06,
"loss": 0.61563702,
"memory(GiB)": 48.58,
"step": 4265,
"train_speed(iter/s)": 0.020014
},
{
"epoch": 1.7677499482508798,
"grad_norm": 1.9867035049680128,
"learning_rate": 1.1974386339381004e-06,
"loss": 0.60806894,
"memory(GiB)": 48.58,
"step": 4270,
"train_speed(iter/s)": 0.020015
},
{
"epoch": 1.769819913061478,
"grad_norm": 2.2593003701423755,
"learning_rate": 1.1867662753468517e-06,
"loss": 0.61837912,
"memory(GiB)": 48.58,
"step": 4275,
"train_speed(iter/s)": 0.020015
},
{
"epoch": 1.7718898778720762,
"grad_norm": 2.4545294029220797,
"learning_rate": 1.176093916755603e-06,
"loss": 0.61774817,
"memory(GiB)": 48.58,
"step": 4280,
"train_speed(iter/s)": 0.020016
},
{
"epoch": 1.7739598426826744,
"grad_norm": 1.9914997887032238,
"learning_rate": 1.1654215581643544e-06,
"loss": 0.58686681,
"memory(GiB)": 48.58,
"step": 4285,
"train_speed(iter/s)": 0.020016
},
{
"epoch": 1.7760298074932725,
"grad_norm": 2.0999532307692896,
"learning_rate": 1.1547491995731057e-06,
"loss": 0.60145164,
"memory(GiB)": 48.58,
"step": 4290,
"train_speed(iter/s)": 0.020017
},
{
"epoch": 1.7780997723038707,
"grad_norm": 2.480798626827661,
"learning_rate": 1.1440768409818571e-06,
"loss": 0.61948671,
"memory(GiB)": 48.58,
"step": 4295,
"train_speed(iter/s)": 0.020018
},
{
"epoch": 1.780169737114469,
"grad_norm": 2.1964203822462527,
"learning_rate": 1.1334044823906084e-06,
"loss": 0.5903161,
"memory(GiB)": 48.58,
"step": 4300,
"train_speed(iter/s)": 0.020018
},
{
"epoch": 1.7822397019250673,
"grad_norm": 2.2986740177948914,
"learning_rate": 1.1227321237993598e-06,
"loss": 0.60875359,
"memory(GiB)": 48.58,
"step": 4305,
"train_speed(iter/s)": 0.020019
},
{
"epoch": 1.7843096667356655,
"grad_norm": 1.994450780247921,
"learning_rate": 1.112059765208111e-06,
"loss": 0.61587105,
"memory(GiB)": 48.58,
"step": 4310,
"train_speed(iter/s)": 0.02002
},
{
"epoch": 1.7863796315462637,
"grad_norm": 2.096310617308162,
"learning_rate": 1.1013874066168623e-06,
"loss": 0.58633337,
"memory(GiB)": 48.58,
"step": 4315,
"train_speed(iter/s)": 0.02002
},
{
"epoch": 1.788449596356862,
"grad_norm": 1.9466363269467861,
"learning_rate": 1.0907150480256138e-06,
"loss": 0.60186481,
"memory(GiB)": 48.58,
"step": 4320,
"train_speed(iter/s)": 0.020021
},
{
"epoch": 1.7905195611674602,
"grad_norm": 2.308498919041305,
"learning_rate": 1.080042689434365e-06,
"loss": 0.59321814,
"memory(GiB)": 48.58,
"step": 4325,
"train_speed(iter/s)": 0.020022
},
{
"epoch": 1.7925895259780584,
"grad_norm": 2.6206625429667874,
"learning_rate": 1.0693703308431163e-06,
"loss": 0.61461964,
"memory(GiB)": 48.58,
"step": 4330,
"train_speed(iter/s)": 0.020022
},
{
"epoch": 1.7946594907886566,
"grad_norm": 2.4886244412597094,
"learning_rate": 1.0586979722518678e-06,
"loss": 0.61646094,
"memory(GiB)": 48.58,
"step": 4335,
"train_speed(iter/s)": 0.020022
},
{
"epoch": 1.7967294555992548,
"grad_norm": 1.9987554252944004,
"learning_rate": 1.0480256136606192e-06,
"loss": 0.60513401,
"memory(GiB)": 48.58,
"step": 4340,
"train_speed(iter/s)": 0.020023
},
{
"epoch": 1.798799420409853,
"grad_norm": 2.909678280995542,
"learning_rate": 1.0373532550693705e-06,
"loss": 0.58900928,
"memory(GiB)": 48.58,
"step": 4345,
"train_speed(iter/s)": 0.020023
},
{
"epoch": 1.8008693852204511,
"grad_norm": 2.232043207476952,
"learning_rate": 1.0266808964781217e-06,
"loss": 0.61423898,
"memory(GiB)": 48.58,
"step": 4350,
"train_speed(iter/s)": 0.020024
},
{
"epoch": 1.8029393500310493,
"grad_norm": 1.9536032593654695,
"learning_rate": 1.0160085378868732e-06,
"loss": 0.61663337,
"memory(GiB)": 48.58,
"step": 4355,
"train_speed(iter/s)": 0.020025
},
{
"epoch": 1.8050093148416477,
"grad_norm": 2.5224030653638048,
"learning_rate": 1.0053361792956244e-06,
"loss": 0.61187458,
"memory(GiB)": 48.58,
"step": 4360,
"train_speed(iter/s)": 0.020025
},
{
"epoch": 1.807079279652246,
"grad_norm": 1.7925531365832896,
"learning_rate": 9.946638207043757e-07,
"loss": 0.59958668,
"memory(GiB)": 48.58,
"step": 4365,
"train_speed(iter/s)": 0.020026
},
{
"epoch": 1.809149244462844,
"grad_norm": 2.000073758007796,
"learning_rate": 9.839914621131271e-07,
"loss": 0.62268171,
"memory(GiB)": 48.58,
"step": 4370,
"train_speed(iter/s)": 0.020027
},
{
"epoch": 1.8112192092734425,
"grad_norm": 1.8927383998351053,
"learning_rate": 9.733191035218784e-07,
"loss": 0.61844292,
"memory(GiB)": 48.58,
"step": 4375,
"train_speed(iter/s)": 0.020028
},
{
"epoch": 1.8132891740840407,
"grad_norm": 2.2603010222528708,
"learning_rate": 9.626467449306296e-07,
"loss": 0.60280704,
"memory(GiB)": 48.58,
"step": 4380,
"train_speed(iter/s)": 0.020028
},
{
"epoch": 1.8153591388946388,
"grad_norm": 2.0743626326418494,
"learning_rate": 9.519743863393811e-07,
"loss": 0.59184837,
"memory(GiB)": 48.58,
"step": 4385,
"train_speed(iter/s)": 0.020029
},
{
"epoch": 1.817429103705237,
"grad_norm": 2.5328803417607273,
"learning_rate": 9.413020277481325e-07,
"loss": 0.60617228,
"memory(GiB)": 48.58,
"step": 4390,
"train_speed(iter/s)": 0.020029
},
{
"epoch": 1.8194990685158352,
"grad_norm": 2.157648955819526,
"learning_rate": 9.306296691568837e-07,
"loss": 0.57980437,
"memory(GiB)": 48.58,
"step": 4395,
"train_speed(iter/s)": 0.020029
},
{
"epoch": 1.8215690333264334,
"grad_norm": 2.238654145184739,
"learning_rate": 9.199573105656351e-07,
"loss": 0.59073811,
"memory(GiB)": 48.58,
"step": 4400,
"train_speed(iter/s)": 0.02003
},
{
"epoch": 1.8236389981370316,
"grad_norm": 2.1560196555189677,
"learning_rate": 9.092849519743864e-07,
"loss": 0.60031624,
"memory(GiB)": 48.58,
"step": 4405,
"train_speed(iter/s)": 0.020031
},
{
"epoch": 1.8257089629476297,
"grad_norm": 1.9778064514427567,
"learning_rate": 8.986125933831377e-07,
"loss": 0.59476948,
"memory(GiB)": 48.58,
"step": 4410,
"train_speed(iter/s)": 0.020031
},
{
"epoch": 1.8277789277582281,
"grad_norm": 1.9062652252413357,
"learning_rate": 8.87940234791889e-07,
"loss": 0.60492353,
"memory(GiB)": 48.58,
"step": 4415,
"train_speed(iter/s)": 0.020032
},
{
"epoch": 1.8298488925688263,
"grad_norm": 2.20251758356039,
"learning_rate": 8.772678762006404e-07,
"loss": 0.57913284,
"memory(GiB)": 48.58,
"step": 4420,
"train_speed(iter/s)": 0.020033
},
{
"epoch": 1.8319188573794245,
"grad_norm": 2.374670539942745,
"learning_rate": 8.665955176093919e-07,
"loss": 0.61436529,
"memory(GiB)": 48.58,
"step": 4425,
"train_speed(iter/s)": 0.020033
},
{
"epoch": 1.833988822190023,
"grad_norm": 1.8014659618328237,
"learning_rate": 8.559231590181431e-07,
"loss": 0.58954339,
"memory(GiB)": 48.58,
"step": 4430,
"train_speed(iter/s)": 0.020034
},
{
"epoch": 1.836058787000621,
"grad_norm": 2.5328263785548706,
"learning_rate": 8.452508004268945e-07,
"loss": 0.62326274,
"memory(GiB)": 48.58,
"step": 4435,
"train_speed(iter/s)": 0.020034
},
{
"epoch": 1.8381287518112193,
"grad_norm": 2.201541673623922,
"learning_rate": 8.345784418356458e-07,
"loss": 0.61668777,
"memory(GiB)": 48.58,
"step": 4440,
"train_speed(iter/s)": 0.020035
},
{
"epoch": 1.8401987166218174,
"grad_norm": 1.9916726147288757,
"learning_rate": 8.239060832443971e-07,
"loss": 0.60296612,
"memory(GiB)": 48.58,
"step": 4445,
"train_speed(iter/s)": 0.020036
},
{
"epoch": 1.8422686814324156,
"grad_norm": 2.2611511913260167,
"learning_rate": 8.132337246531484e-07,
"loss": 0.59926748,
"memory(GiB)": 48.58,
"step": 4450,
"train_speed(iter/s)": 0.020037
},
{
"epoch": 1.8443386462430138,
"grad_norm": 1.8225104361277575,
"learning_rate": 8.025613660618997e-07,
"loss": 0.58590517,
"memory(GiB)": 48.58,
"step": 4455,
"train_speed(iter/s)": 0.020038
},
{
"epoch": 1.846408611053612,
"grad_norm": 2.350449897326385,
"learning_rate": 7.91889007470651e-07,
"loss": 0.60878654,
"memory(GiB)": 48.58,
"step": 4460,
"train_speed(iter/s)": 0.020038
},
{
"epoch": 1.8484785758642102,
"grad_norm": 2.4294512189314075,
"learning_rate": 7.812166488794024e-07,
"loss": 0.60740719,
"memory(GiB)": 48.58,
"step": 4465,
"train_speed(iter/s)": 0.020039
},
{
"epoch": 1.8505485406748086,
"grad_norm": 2.0078565564513413,
"learning_rate": 7.705442902881538e-07,
"loss": 0.61569843,
"memory(GiB)": 48.58,
"step": 4470,
"train_speed(iter/s)": 0.020039
},
{
"epoch": 1.8526185054854067,
"grad_norm": 2.416347350001525,
"learning_rate": 7.598719316969051e-07,
"loss": 0.59907522,
"memory(GiB)": 48.58,
"step": 4475,
"train_speed(iter/s)": 0.02004
},
{
"epoch": 1.854688470296005,
"grad_norm": 2.5546773910049523,
"learning_rate": 7.491995731056565e-07,
"loss": 0.58857327,
"memory(GiB)": 48.58,
"step": 4480,
"train_speed(iter/s)": 0.02004
},
{
"epoch": 1.8567584351066033,
"grad_norm": 2.1560857096799775,
"learning_rate": 7.385272145144078e-07,
"loss": 0.59972405,
"memory(GiB)": 48.58,
"step": 4485,
"train_speed(iter/s)": 0.020041
},
{
"epoch": 1.8588283999172015,
"grad_norm": 2.8045450674344155,
"learning_rate": 7.278548559231591e-07,
"loss": 0.62519865,
"memory(GiB)": 48.58,
"step": 4490,
"train_speed(iter/s)": 0.020042
},
{
"epoch": 1.8608983647277997,
"grad_norm": 1.88470841271018,
"learning_rate": 7.171824973319104e-07,
"loss": 0.59394321,
"memory(GiB)": 48.58,
"step": 4495,
"train_speed(iter/s)": 0.020042
},
{
"epoch": 1.8629683295383979,
"grad_norm": 1.900323108929672,
"learning_rate": 7.065101387406617e-07,
"loss": 0.59171925,
"memory(GiB)": 48.58,
"step": 4500,
"train_speed(iter/s)": 0.020043
},
{
"epoch": 1.8629683295383979,
"eval_loss": 0.7995001077651978,
"eval_runtime": 333.5022,
"eval_samples_per_second": 18.729,
"eval_steps_per_second": 1.172,
"step": 4500
},
{
"epoch": 1.865038294348996,
"grad_norm": 2.3418380686910854,
"learning_rate": 6.95837780149413e-07,
"loss": 0.62550597,
"memory(GiB)": 48.58,
"step": 4505,
"train_speed(iter/s)": 0.020009
},
{
"epoch": 1.8671082591595942,
"grad_norm": 2.2011802954910826,
"learning_rate": 6.851654215581644e-07,
"loss": 0.60017891,
"memory(GiB)": 48.58,
"step": 4510,
"train_speed(iter/s)": 0.02001
},
{
"epoch": 1.8691782239701924,
"grad_norm": 2.346777967941492,
"learning_rate": 6.744930629669158e-07,
"loss": 0.62277446,
"memory(GiB)": 48.58,
"step": 4515,
"train_speed(iter/s)": 0.020011
},
{
"epoch": 1.8712481887807906,
"grad_norm": 2.3039363321887385,
"learning_rate": 6.638207043756671e-07,
"loss": 0.63189201,
"memory(GiB)": 48.58,
"step": 4520,
"train_speed(iter/s)": 0.020012
},
{
"epoch": 1.873318153591389,
"grad_norm": 2.4254296862394753,
"learning_rate": 6.531483457844184e-07,
"loss": 0.59847736,
"memory(GiB)": 48.58,
"step": 4525,
"train_speed(iter/s)": 0.020012
},
{
"epoch": 1.8753881184019872,
"grad_norm": 2.520645231015452,
"learning_rate": 6.424759871931698e-07,
"loss": 0.60772429,
"memory(GiB)": 48.58,
"step": 4530,
"train_speed(iter/s)": 0.020013
},
{
"epoch": 1.8774580832125853,
"grad_norm": 2.14246492812266,
"learning_rate": 6.31803628601921e-07,
"loss": 0.61204777,
"memory(GiB)": 48.58,
"step": 4535,
"train_speed(iter/s)": 0.020014
},
{
"epoch": 1.8795280480231837,
"grad_norm": 1.9822250514442827,
"learning_rate": 6.211312700106724e-07,
"loss": 0.55947261,
"memory(GiB)": 48.58,
"step": 4540,
"train_speed(iter/s)": 0.020014
},
{
"epoch": 1.881598012833782,
"grad_norm": 2.052778758307509,
"learning_rate": 6.104589114194238e-07,
"loss": 0.60927758,
"memory(GiB)": 48.58,
"step": 4545,
"train_speed(iter/s)": 0.020015
},
{
"epoch": 1.88366797764438,
"grad_norm": 2.3908992672472222,
"learning_rate": 5.997865528281751e-07,
"loss": 0.61153603,
"memory(GiB)": 48.58,
"step": 4550,
"train_speed(iter/s)": 0.020016
},
{
"epoch": 1.8857379424549783,
"grad_norm": 1.9114509697538704,
"learning_rate": 5.891141942369264e-07,
"loss": 0.6099647,
"memory(GiB)": 48.58,
"step": 4555,
"train_speed(iter/s)": 0.020016
},
{
"epoch": 1.8878079072655765,
"grad_norm": 2.063913038194293,
"learning_rate": 5.784418356456777e-07,
"loss": 0.61249609,
"memory(GiB)": 48.58,
"step": 4560,
"train_speed(iter/s)": 0.020017
},
{
"epoch": 1.8898778720761746,
"grad_norm": 2.3274588748356404,
"learning_rate": 5.677694770544291e-07,
"loss": 0.60335112,
"memory(GiB)": 48.58,
"step": 4565,
"train_speed(iter/s)": 0.020018
},
{
"epoch": 1.8919478368867728,
"grad_norm": 2.5803919299014404,
"learning_rate": 5.570971184631804e-07,
"loss": 0.59715414,
"memory(GiB)": 48.58,
"step": 4570,
"train_speed(iter/s)": 0.020018
},
{
"epoch": 1.894017801697371,
"grad_norm": 2.0982349697996727,
"learning_rate": 5.464247598719318e-07,
"loss": 0.59020104,
"memory(GiB)": 48.58,
"step": 4575,
"train_speed(iter/s)": 0.020019
},
{
"epoch": 1.8960877665079694,
"grad_norm": 2.4156310349424492,
"learning_rate": 5.35752401280683e-07,
"loss": 0.58418913,
"memory(GiB)": 48.58,
"step": 4580,
"train_speed(iter/s)": 0.02002
},
{
"epoch": 1.8981577313185676,
"grad_norm": 2.1192384982329977,
"learning_rate": 5.250800426894344e-07,
"loss": 0.57976351,
"memory(GiB)": 48.58,
"step": 4585,
"train_speed(iter/s)": 0.02002
},
{
"epoch": 1.9002276961291658,
"grad_norm": 2.363144438078418,
"learning_rate": 5.144076840981858e-07,
"loss": 0.59181528,
"memory(GiB)": 48.58,
"step": 4590,
"train_speed(iter/s)": 0.020021
},
{
"epoch": 1.9022976609397642,
"grad_norm": 2.014916601780685,
"learning_rate": 5.037353255069371e-07,
"loss": 0.57993307,
"memory(GiB)": 48.58,
"step": 4595,
"train_speed(iter/s)": 0.020022
},
{
"epoch": 1.9043676257503623,
"grad_norm": 2.1798732452842615,
"learning_rate": 4.930629669156884e-07,
"loss": 0.61651163,
"memory(GiB)": 48.58,
"step": 4600,
"train_speed(iter/s)": 0.020022
},
{
"epoch": 1.9064375905609605,
"grad_norm": 2.4067029242954847,
"learning_rate": 4.823906083244397e-07,
"loss": 0.60284195,
"memory(GiB)": 48.58,
"step": 4605,
"train_speed(iter/s)": 0.020023
},
{
"epoch": 1.9085075553715587,
"grad_norm": 2.1464043153264587,
"learning_rate": 4.7171824973319113e-07,
"loss": 0.58217282,
"memory(GiB)": 48.58,
"step": 4610,
"train_speed(iter/s)": 0.020023
},
{
"epoch": 1.9105775201821569,
"grad_norm": 2.786031828765745,
"learning_rate": 4.6104589114194243e-07,
"loss": 0.59245019,
"memory(GiB)": 48.58,
"step": 4615,
"train_speed(iter/s)": 0.020024
},
{
"epoch": 1.912647484992755,
"grad_norm": 1.971002889872401,
"learning_rate": 4.5037353255069374e-07,
"loss": 0.55735373,
"memory(GiB)": 48.58,
"step": 4620,
"train_speed(iter/s)": 0.020025
},
{
"epoch": 1.9147174498033532,
"grad_norm": 2.6133112222197097,
"learning_rate": 4.3970117395944504e-07,
"loss": 0.58715906,
"memory(GiB)": 48.58,
"step": 4625,
"train_speed(iter/s)": 0.020025
},
{
"epoch": 1.9167874146139514,
"grad_norm": 2.2493774425876496,
"learning_rate": 4.290288153681964e-07,
"loss": 0.61737943,
"memory(GiB)": 48.58,
"step": 4630,
"train_speed(iter/s)": 0.020026
},
{
"epoch": 1.9188573794245498,
"grad_norm": 2.066157101262181,
"learning_rate": 4.1835645677694775e-07,
"loss": 0.58552856,
"memory(GiB)": 48.58,
"step": 4635,
"train_speed(iter/s)": 0.020027
},
{
"epoch": 1.920927344235148,
"grad_norm": 2.2279403099524164,
"learning_rate": 4.076840981856991e-07,
"loss": 0.60430613,
"memory(GiB)": 48.58,
"step": 4640,
"train_speed(iter/s)": 0.020027
},
{
"epoch": 1.9229973090457462,
"grad_norm": 2.21706254471347,
"learning_rate": 3.970117395944504e-07,
"loss": 0.60209589,
"memory(GiB)": 48.58,
"step": 4645,
"train_speed(iter/s)": 0.020027
},
{
"epoch": 1.9250672738563446,
"grad_norm": 1.8175472634449323,
"learning_rate": 3.863393810032017e-07,
"loss": 0.57723808,
"memory(GiB)": 48.58,
"step": 4650,
"train_speed(iter/s)": 0.020028
},
{
"epoch": 1.9271372386669428,
"grad_norm": 2.013736958312471,
"learning_rate": 3.756670224119531e-07,
"loss": 0.59187717,
"memory(GiB)": 48.58,
"step": 4655,
"train_speed(iter/s)": 0.020028
},
{
"epoch": 1.929207203477541,
"grad_norm": 2.9147581447233883,
"learning_rate": 3.649946638207044e-07,
"loss": 0.62887087,
"memory(GiB)": 48.58,
"step": 4660,
"train_speed(iter/s)": 0.020029
},
{
"epoch": 1.9312771682881391,
"grad_norm": 2.1113909828567605,
"learning_rate": 3.5432230522945573e-07,
"loss": 0.60369582,
"memory(GiB)": 48.58,
"step": 4665,
"train_speed(iter/s)": 0.02003
},
{
"epoch": 1.9333471330987373,
"grad_norm": 2.42442448807154,
"learning_rate": 3.4364994663820703e-07,
"loss": 0.62186384,
"memory(GiB)": 48.58,
"step": 4670,
"train_speed(iter/s)": 0.02003
},
{
"epoch": 1.9354170979093355,
"grad_norm": 2.380814562965916,
"learning_rate": 3.329775880469584e-07,
"loss": 0.59275131,
"memory(GiB)": 48.58,
"step": 4675,
"train_speed(iter/s)": 0.020031
},
{
"epoch": 1.9374870627199337,
"grad_norm": 2.6584440377132363,
"learning_rate": 3.2230522945570974e-07,
"loss": 0.60647793,
"memory(GiB)": 48.58,
"step": 4680,
"train_speed(iter/s)": 0.020031
},
{
"epoch": 1.9395570275305318,
"grad_norm": 1.9567852417003078,
"learning_rate": 3.116328708644611e-07,
"loss": 0.60262537,
"memory(GiB)": 48.58,
"step": 4685,
"train_speed(iter/s)": 0.020032
},
{
"epoch": 1.9416269923411302,
"grad_norm": 2.3501175824898266,
"learning_rate": 3.009605122732124e-07,
"loss": 0.58797078,
"memory(GiB)": 48.58,
"step": 4690,
"train_speed(iter/s)": 0.020032
},
{
"epoch": 1.9436969571517284,
"grad_norm": 2.237666059037871,
"learning_rate": 2.9028815368196376e-07,
"loss": 0.60981102,
"memory(GiB)": 48.58,
"step": 4695,
"train_speed(iter/s)": 0.020033
},
{
"epoch": 1.9457669219623266,
"grad_norm": 2.4233069808853993,
"learning_rate": 2.7961579509071506e-07,
"loss": 0.60726156,
"memory(GiB)": 48.58,
"step": 4700,
"train_speed(iter/s)": 0.020033
},
{
"epoch": 1.947836886772925,
"grad_norm": 2.260763775980905,
"learning_rate": 2.689434364994664e-07,
"loss": 0.60588284,
"memory(GiB)": 48.58,
"step": 4705,
"train_speed(iter/s)": 0.020034
},
{
"epoch": 1.9499068515835232,
"grad_norm": 2.5081221538985625,
"learning_rate": 2.582710779082177e-07,
"loss": 0.59843764,
"memory(GiB)": 48.58,
"step": 4710,
"train_speed(iter/s)": 0.020034
},
{
"epoch": 1.9519768163941214,
"grad_norm": 2.3881113527972304,
"learning_rate": 2.475987193169691e-07,
"loss": 0.58006935,
"memory(GiB)": 48.58,
"step": 4715,
"train_speed(iter/s)": 0.020035
},
{
"epoch": 1.9540467812047195,
"grad_norm": 2.0678535427683564,
"learning_rate": 2.369263607257204e-07,
"loss": 0.60157442,
"memory(GiB)": 48.58,
"step": 4720,
"train_speed(iter/s)": 0.020036
},
{
"epoch": 1.9561167460153177,
"grad_norm": 2.438506236548372,
"learning_rate": 2.2625400213447176e-07,
"loss": 0.61028309,
"memory(GiB)": 48.58,
"step": 4725,
"train_speed(iter/s)": 0.020036
},
{
"epoch": 1.9581867108259159,
"grad_norm": 2.0110599859655482,
"learning_rate": 2.1558164354322307e-07,
"loss": 0.59143724,
"memory(GiB)": 48.58,
"step": 4730,
"train_speed(iter/s)": 0.020037
},
{
"epoch": 1.960256675636514,
"grad_norm": 2.0688837840068124,
"learning_rate": 2.049092849519744e-07,
"loss": 0.60122604,
"memory(GiB)": 48.58,
"step": 4735,
"train_speed(iter/s)": 0.020037
},
{
"epoch": 1.9623266404471122,
"grad_norm": 2.1607611011042964,
"learning_rate": 1.9423692636072575e-07,
"loss": 0.59857554,
"memory(GiB)": 48.58,
"step": 4740,
"train_speed(iter/s)": 0.020038
},
{
"epoch": 1.9643966052577106,
"grad_norm": 1.9355873743150782,
"learning_rate": 1.8356456776947706e-07,
"loss": 0.58816404,
"memory(GiB)": 48.58,
"step": 4745,
"train_speed(iter/s)": 0.020038
},
{
"epoch": 1.9664665700683088,
"grad_norm": 2.1028560489657915,
"learning_rate": 1.728922091782284e-07,
"loss": 0.59075899,
"memory(GiB)": 48.58,
"step": 4750,
"train_speed(iter/s)": 0.020039
},
{
"epoch": 1.968536534878907,
"grad_norm": 1.9716163219927025,
"learning_rate": 1.6221985058697972e-07,
"loss": 0.58181,
"memory(GiB)": 48.58,
"step": 4755,
"train_speed(iter/s)": 0.020039
},
{
"epoch": 1.9706064996895054,
"grad_norm": 2.243337925247892,
"learning_rate": 1.5154749199573107e-07,
"loss": 0.58783703,
"memory(GiB)": 48.58,
"step": 4760,
"train_speed(iter/s)": 0.02004
},
{
"epoch": 1.9726764645001036,
"grad_norm": 2.4859440712948166,
"learning_rate": 1.408751334044824e-07,
"loss": 0.60655708,
"memory(GiB)": 48.58,
"step": 4765,
"train_speed(iter/s)": 0.02004
},
{
"epoch": 1.9747464293107018,
"grad_norm": 1.9831273691126385,
"learning_rate": 1.3020277481323373e-07,
"loss": 0.61188507,
"memory(GiB)": 48.58,
"step": 4770,
"train_speed(iter/s)": 0.020041
},
{
"epoch": 1.9768163941213,
"grad_norm": 2.2314049587449807,
"learning_rate": 1.1953041622198506e-07,
"loss": 0.58718634,
"memory(GiB)": 48.58,
"step": 4775,
"train_speed(iter/s)": 0.020041
},
{
"epoch": 1.9788863589318981,
"grad_norm": 1.8150041254780722,
"learning_rate": 1.088580576307364e-07,
"loss": 0.58888893,
"memory(GiB)": 48.58,
"step": 4780,
"train_speed(iter/s)": 0.020042
},
{
"epoch": 1.9809563237424963,
"grad_norm": 2.247396093692947,
"learning_rate": 9.818569903948773e-08,
"loss": 0.58732767,
"memory(GiB)": 48.58,
"step": 4785,
"train_speed(iter/s)": 0.020042
},
{
"epoch": 1.9830262885530945,
"grad_norm": 2.2098523979831644,
"learning_rate": 8.751334044823908e-08,
"loss": 0.56731772,
"memory(GiB)": 48.58,
"step": 4790,
"train_speed(iter/s)": 0.020043
},
{
"epoch": 1.9850962533636927,
"grad_norm": 2.097191818370972,
"learning_rate": 7.68409818569904e-08,
"loss": 0.60409393,
"memory(GiB)": 48.58,
"step": 4795,
"train_speed(iter/s)": 0.020043
},
{
"epoch": 1.987166218174291,
"grad_norm": 2.228865758940409,
"learning_rate": 6.616862326574174e-08,
"loss": 0.58583736,
"memory(GiB)": 48.58,
"step": 4800,
"train_speed(iter/s)": 0.020043
},
{
"epoch": 1.987166218174291,
"eval_loss": 0.7990086674690247,
"eval_runtime": 335.524,
"eval_samples_per_second": 18.616,
"eval_steps_per_second": 1.165,
"step": 4800
},
{
"epoch": 1.9892361829848892,
"grad_norm": 2.144126117644109,
"learning_rate": 5.5496264674493065e-08,
"loss": 0.59295273,
"memory(GiB)": 48.58,
"step": 4805,
"train_speed(iter/s)": 0.020012
},
{
"epoch": 1.9913061477954874,
"grad_norm": 2.020692421672203,
"learning_rate": 4.48239060832444e-08,
"loss": 0.58215327,
"memory(GiB)": 48.58,
"step": 4810,
"train_speed(iter/s)": 0.020012
},
{
"epoch": 1.9933761126060858,
"grad_norm": 2.560575104653987,
"learning_rate": 3.415154749199574e-08,
"loss": 0.59236603,
"memory(GiB)": 48.58,
"step": 4815,
"train_speed(iter/s)": 0.020013
},
{
"epoch": 1.995446077416684,
"grad_norm": 2.242799455811118,
"learning_rate": 2.347918890074707e-08,
"loss": 0.59284697,
"memory(GiB)": 48.58,
"step": 4820,
"train_speed(iter/s)": 0.020013
},
{
"epoch": 1.9975160422272822,
"grad_norm": 2.496931570288639,
"learning_rate": 1.28068303094984e-08,
"loss": 0.58884125,
"memory(GiB)": 48.58,
"step": 4825,
"train_speed(iter/s)": 0.020014
},
{
"epoch": 1.9995860070378804,
"grad_norm": 2.1504541545847133,
"learning_rate": 2.1344717182497336e-09,
"loss": 0.57651815,
"memory(GiB)": 48.58,
"step": 4830,
"train_speed(iter/s)": 0.020014
},
{
"epoch": 1.9995860070378804,
"eval_loss": 0.7990483641624451,
"eval_runtime": 333.2574,
"eval_samples_per_second": 18.742,
"eval_steps_per_second": 1.173,
"step": 4830
}
],
"logging_steps": 5,
"max_steps": 4830,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 300,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.0363792423256064e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}