{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.9324922169424874, "eval_steps": 500, "global_step": 3000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 2.323194980621338, "learning_rate": 4.9999946882250004e-05, "loss": 0.2105, "step": 5 }, { "epoch": 0.01, "grad_norm": 3.090125799179077, "learning_rate": 4.999978752922572e-05, "loss": 0.2656, "step": 10 }, { "epoch": 0.02, "grad_norm": 2.8763957023620605, "learning_rate": 4.999952194160431e-05, "loss": 0.3075, "step": 15 }, { "epoch": 0.03, "grad_norm": 2.714475631713867, "learning_rate": 4.999915012051437e-05, "loss": 0.3164, "step": 20 }, { "epoch": 0.03, "grad_norm": 2.6384477615356445, "learning_rate": 4.999867206753593e-05, "loss": 0.3167, "step": 25 }, { "epoch": 0.04, "grad_norm": 3.147310733795166, "learning_rate": 4.9998087784700426e-05, "loss": 0.3379, "step": 30 }, { "epoch": 0.05, "grad_norm": 2.4968576431274414, "learning_rate": 4.9997397274490725e-05, "loss": 0.3289, "step": 35 }, { "epoch": 0.05, "grad_norm": 2.5513858795166016, "learning_rate": 4.9996600539841096e-05, "loss": 0.3304, "step": 40 }, { "epoch": 0.06, "grad_norm": 2.6501898765563965, "learning_rate": 4.99956975841372e-05, "loss": 0.3316, "step": 45 }, { "epoch": 0.07, "grad_norm": 3.1661622524261475, "learning_rate": 4.9994688411216076e-05, "loss": 0.3352, "step": 50 }, { "epoch": 0.07, "grad_norm": 3.1451425552368164, "learning_rate": 4.9993573025366124e-05, "loss": 0.3337, "step": 55 }, { "epoch": 0.08, "grad_norm": 3.058828115463257, "learning_rate": 4.999235143132708e-05, "loss": 0.3283, "step": 60 }, { "epoch": 0.09, "grad_norm": 2.6413097381591797, "learning_rate": 4.999102363429002e-05, "loss": 0.312, "step": 65 }, { "epoch": 0.09, "grad_norm": 2.8983662128448486, "learning_rate": 4.99895896398973e-05, "loss": 0.3184, "step": 70 }, { "epoch": 0.1, "grad_norm": 2.6056504249572754, "learning_rate": 4.998804945424258e-05, "loss": 0.3464, "step": 75 }, { "epoch": 0.1, "grad_norm": 2.5964763164520264, "learning_rate": 4.998640308387074e-05, "loss": 0.335, "step": 80 }, { "epoch": 0.11, "grad_norm": 2.594703435897827, "learning_rate": 4.9984650535777896e-05, "loss": 0.3487, "step": 85 }, { "epoch": 0.12, "grad_norm": 2.996779203414917, "learning_rate": 4.9982791817411386e-05, "loss": 0.346, "step": 90 }, { "epoch": 0.12, "grad_norm": 2.9310555458068848, "learning_rate": 4.998082693666966e-05, "loss": 0.3203, "step": 95 }, { "epoch": 0.13, "grad_norm": 3.1505908966064453, "learning_rate": 4.997875590190233e-05, "loss": 0.3766, "step": 100 }, { "epoch": 0.14, "grad_norm": 2.8678457736968994, "learning_rate": 4.9976578721910106e-05, "loss": 0.3404, "step": 105 }, { "epoch": 0.14, "grad_norm": 2.8068525791168213, "learning_rate": 4.9974295405944714e-05, "loss": 0.3249, "step": 110 }, { "epoch": 0.15, "grad_norm": 2.8355133533477783, "learning_rate": 4.9971905963708946e-05, "loss": 0.3226, "step": 115 }, { "epoch": 0.16, "grad_norm": 3.0205864906311035, "learning_rate": 4.996941040535653e-05, "loss": 0.3613, "step": 120 }, { "epoch": 0.16, "grad_norm": 2.9807019233703613, "learning_rate": 4.9966808741492153e-05, "loss": 0.3284, "step": 125 }, { "epoch": 0.17, "grad_norm": 3.070850372314453, "learning_rate": 4.996410098317137e-05, "loss": 0.3217, "step": 130 }, { "epoch": 0.18, "grad_norm": 3.1543736457824707, "learning_rate": 4.996128714190058e-05, "loss": 0.3636, "step": 135 }, { "epoch": 0.18, "grad_norm": 2.80784273147583, "learning_rate": 4.995836722963699e-05, "loss": 0.3379, "step": 140 }, { "epoch": 0.19, "grad_norm": 2.6561925411224365, "learning_rate": 4.9955341258788526e-05, "loss": 0.3442, "step": 145 }, { "epoch": 0.2, "grad_norm": 2.814857006072998, "learning_rate": 4.99522092422138e-05, "loss": 0.3456, "step": 150 }, { "epoch": 0.2, "grad_norm": 2.924438238143921, "learning_rate": 4.9948971193222086e-05, "loss": 0.3436, "step": 155 }, { "epoch": 0.21, "grad_norm": 2.7895913124084473, "learning_rate": 4.994562712557319e-05, "loss": 0.3319, "step": 160 }, { "epoch": 0.22, "grad_norm": 2.878596544265747, "learning_rate": 4.9942177053477474e-05, "loss": 0.342, "step": 165 }, { "epoch": 0.22, "grad_norm": 3.0157933235168457, "learning_rate": 4.993862099159574e-05, "loss": 0.3335, "step": 170 }, { "epoch": 0.23, "grad_norm": 2.585909366607666, "learning_rate": 4.99349589550392e-05, "loss": 0.3373, "step": 175 }, { "epoch": 0.24, "grad_norm": 2.992539167404175, "learning_rate": 4.993119095936937e-05, "loss": 0.3318, "step": 180 }, { "epoch": 0.24, "grad_norm": 2.995790719985962, "learning_rate": 4.992731702059805e-05, "loss": 0.3289, "step": 185 }, { "epoch": 0.25, "grad_norm": 3.0879945755004883, "learning_rate": 4.9923337155187235e-05, "loss": 0.3309, "step": 190 }, { "epoch": 0.26, "grad_norm": 2.9949581623077393, "learning_rate": 4.991925138004905e-05, "loss": 0.3471, "step": 195 }, { "epoch": 0.26, "grad_norm": 2.873623847961426, "learning_rate": 4.991505971254566e-05, "loss": 0.3463, "step": 200 }, { "epoch": 0.27, "grad_norm": 3.0350987911224365, "learning_rate": 4.9910762170489226e-05, "loss": 0.33, "step": 205 }, { "epoch": 0.28, "grad_norm": 2.794652223587036, "learning_rate": 4.99063587721418e-05, "loss": 0.3723, "step": 210 }, { "epoch": 0.28, "grad_norm": 2.4460439682006836, "learning_rate": 4.990184953621528e-05, "loss": 0.3512, "step": 215 }, { "epoch": 0.29, "grad_norm": 2.772210121154785, "learning_rate": 4.989723448187131e-05, "loss": 0.3232, "step": 220 }, { "epoch": 0.29, "grad_norm": 3.0419492721557617, "learning_rate": 4.989251362872119e-05, "loss": 0.3364, "step": 225 }, { "epoch": 0.3, "grad_norm": 3.0731663703918457, "learning_rate": 4.988768699682579e-05, "loss": 0.355, "step": 230 }, { "epoch": 0.31, "grad_norm": 2.81030535697937, "learning_rate": 4.9882754606695524e-05, "loss": 0.3158, "step": 235 }, { "epoch": 0.31, "grad_norm": 2.7734286785125732, "learning_rate": 4.9877716479290174e-05, "loss": 0.3286, "step": 240 }, { "epoch": 0.32, "grad_norm": 2.918984889984131, "learning_rate": 4.987257263601885e-05, "loss": 0.3314, "step": 245 }, { "epoch": 0.33, "grad_norm": 2.953944206237793, "learning_rate": 4.986732309873992e-05, "loss": 0.3179, "step": 250 }, { "epoch": 0.33, "grad_norm": 2.8582749366760254, "learning_rate": 4.986196788976086e-05, "loss": 0.3238, "step": 255 }, { "epoch": 0.34, "grad_norm": 2.757632255554199, "learning_rate": 4.985650703183822e-05, "loss": 0.3413, "step": 260 }, { "epoch": 0.35, "grad_norm": 2.9017035961151123, "learning_rate": 4.985094054817746e-05, "loss": 0.3335, "step": 265 }, { "epoch": 0.35, "grad_norm": 3.1360483169555664, "learning_rate": 4.9845268462432916e-05, "loss": 0.3474, "step": 270 }, { "epoch": 0.36, "grad_norm": 2.847700834274292, "learning_rate": 4.983949079870765e-05, "loss": 0.3471, "step": 275 }, { "epoch": 0.37, "grad_norm": 2.8749804496765137, "learning_rate": 4.983360758155341e-05, "loss": 0.3389, "step": 280 }, { "epoch": 0.37, "grad_norm": 2.9671127796173096, "learning_rate": 4.9827618835970426e-05, "loss": 0.3379, "step": 285 }, { "epoch": 0.38, "grad_norm": 2.869534730911255, "learning_rate": 4.982152458740741e-05, "loss": 0.328, "step": 290 }, { "epoch": 0.39, "grad_norm": 2.9593312740325928, "learning_rate": 4.981532486176138e-05, "loss": 0.348, "step": 295 }, { "epoch": 0.39, "grad_norm": 3.288499593734741, "learning_rate": 4.980901968537758e-05, "loss": 0.3691, "step": 300 }, { "epoch": 0.4, "grad_norm": 3.329684257507324, "learning_rate": 4.980260908504934e-05, "loss": 0.3426, "step": 305 }, { "epoch": 0.41, "grad_norm": 2.8230020999908447, "learning_rate": 4.9796093088018e-05, "loss": 0.3367, "step": 310 }, { "epoch": 0.41, "grad_norm": 2.743234157562256, "learning_rate": 4.978947172197277e-05, "loss": 0.3594, "step": 315 }, { "epoch": 0.42, "grad_norm": 2.874333620071411, "learning_rate": 4.978274501505061e-05, "loss": 0.3394, "step": 320 }, { "epoch": 0.43, "grad_norm": 3.2279603481292725, "learning_rate": 4.9775912995836136e-05, "loss": 0.3307, "step": 325 }, { "epoch": 0.43, "grad_norm": 2.608811378479004, "learning_rate": 4.9768975693361454e-05, "loss": 0.3431, "step": 330 }, { "epoch": 0.44, "grad_norm": 2.868130683898926, "learning_rate": 4.976193313710608e-05, "loss": 0.3273, "step": 335 }, { "epoch": 0.45, "grad_norm": 3.2972702980041504, "learning_rate": 4.9754785356996787e-05, "loss": 0.3453, "step": 340 }, { "epoch": 0.45, "grad_norm": 3.2560746669769287, "learning_rate": 4.9747532383407504e-05, "loss": 0.3831, "step": 345 }, { "epoch": 0.46, "grad_norm": 3.038130760192871, "learning_rate": 4.9740174247159156e-05, "loss": 0.3916, "step": 350 }, { "epoch": 0.47, "grad_norm": 3.453185796737671, "learning_rate": 4.973271097951956e-05, "loss": 0.3661, "step": 355 }, { "epoch": 0.47, "grad_norm": 2.8996922969818115, "learning_rate": 4.9725142612203265e-05, "loss": 0.3685, "step": 360 }, { "epoch": 0.48, "grad_norm": 3.0452466011047363, "learning_rate": 4.971746917737146e-05, "loss": 0.3723, "step": 365 }, { "epoch": 0.49, "grad_norm": 3.064406156539917, "learning_rate": 4.970969070763177e-05, "loss": 0.4086, "step": 370 }, { "epoch": 0.49, "grad_norm": 3.1569948196411133, "learning_rate": 4.9701807236038204e-05, "loss": 0.4095, "step": 375 }, { "epoch": 0.5, "grad_norm": 3.0515644550323486, "learning_rate": 4.9693818796090927e-05, "loss": 0.4156, "step": 380 }, { "epoch": 0.5, "grad_norm": 3.2472422122955322, "learning_rate": 4.968572542173617e-05, "loss": 0.4684, "step": 385 }, { "epoch": 0.51, "grad_norm": 3.417625904083252, "learning_rate": 4.96775271473661e-05, "loss": 0.4493, "step": 390 }, { "epoch": 0.52, "grad_norm": 3.272538185119629, "learning_rate": 4.9669224007818623e-05, "loss": 0.4514, "step": 395 }, { "epoch": 0.52, "grad_norm": 2.8853087425231934, "learning_rate": 4.966081603837725e-05, "loss": 0.4629, "step": 400 }, { "epoch": 0.53, "grad_norm": 3.1684935092926025, "learning_rate": 4.965230327477099e-05, "loss": 0.4347, "step": 405 }, { "epoch": 0.54, "grad_norm": 3.0273630619049072, "learning_rate": 4.964368575317415e-05, "loss": 0.4532, "step": 410 }, { "epoch": 0.54, "grad_norm": 2.8964247703552246, "learning_rate": 4.963496351020619e-05, "loss": 0.4514, "step": 415 }, { "epoch": 0.55, "grad_norm": 3.0990092754364014, "learning_rate": 4.962613658293158e-05, "loss": 0.4611, "step": 420 }, { "epoch": 0.56, "grad_norm": 3.376248836517334, "learning_rate": 4.961720500885967e-05, "loss": 0.4585, "step": 425 }, { "epoch": 0.56, "grad_norm": 3.2961933612823486, "learning_rate": 4.960816882594443e-05, "loss": 0.4574, "step": 430 }, { "epoch": 0.57, "grad_norm": 3.159632682800293, "learning_rate": 4.959902807258443e-05, "loss": 0.4567, "step": 435 }, { "epoch": 0.58, "grad_norm": 3.271243095397949, "learning_rate": 4.958978278762255e-05, "loss": 0.4709, "step": 440 }, { "epoch": 0.58, "grad_norm": 2.813108205795288, "learning_rate": 4.958043301034589e-05, "loss": 0.477, "step": 445 }, { "epoch": 0.59, "grad_norm": 3.1648154258728027, "learning_rate": 4.95709787804856e-05, "loss": 0.4489, "step": 450 }, { "epoch": 0.6, "grad_norm": 3.2871389389038086, "learning_rate": 4.9561420138216645e-05, "loss": 0.4604, "step": 455 }, { "epoch": 0.6, "grad_norm": 3.215829849243164, "learning_rate": 4.955175712415773e-05, "loss": 0.4703, "step": 460 }, { "epoch": 0.61, "grad_norm": 3.0727405548095703, "learning_rate": 4.954198977937106e-05, "loss": 0.4745, "step": 465 }, { "epoch": 0.62, "grad_norm": 3.3414416313171387, "learning_rate": 4.953211814536217e-05, "loss": 0.4481, "step": 470 }, { "epoch": 0.62, "grad_norm": 3.058262348175049, "learning_rate": 4.9522142264079794e-05, "loss": 0.4765, "step": 475 }, { "epoch": 0.63, "grad_norm": 2.8146088123321533, "learning_rate": 4.951206217791564e-05, "loss": 0.4682, "step": 480 }, { "epoch": 0.64, "grad_norm": 3.241665840148926, "learning_rate": 4.9501877929704215e-05, "loss": 0.4803, "step": 485 }, { "epoch": 0.64, "grad_norm": 3.362031936645508, "learning_rate": 4.949158956272268e-05, "loss": 0.5213, "step": 490 }, { "epoch": 0.65, "grad_norm": 3.4168310165405273, "learning_rate": 4.948119712069062e-05, "loss": 0.5243, "step": 495 }, { "epoch": 0.66, "grad_norm": 3.469191312789917, "learning_rate": 4.9470700647769904e-05, "loss": 0.5824, "step": 500 }, { "epoch": 0.66, "grad_norm": 3.592074394226074, "learning_rate": 4.9460100188564426e-05, "loss": 0.5777, "step": 505 }, { "epoch": 0.67, "grad_norm": 3.4984512329101562, "learning_rate": 4.944939578812001e-05, "loss": 0.6011, "step": 510 }, { "epoch": 0.68, "grad_norm": 3.6021430492401123, "learning_rate": 4.943858749192414e-05, "loss": 0.6145, "step": 515 }, { "epoch": 0.68, "grad_norm": 3.7211990356445312, "learning_rate": 4.942767534590581e-05, "loss": 0.6159, "step": 520 }, { "epoch": 0.69, "grad_norm": 3.5600759983062744, "learning_rate": 4.9416659396435304e-05, "loss": 0.5823, "step": 525 }, { "epoch": 0.69, "grad_norm": 3.6605124473571777, "learning_rate": 4.940553969032403e-05, "loss": 0.6421, "step": 530 }, { "epoch": 0.7, "grad_norm": 3.654963493347168, "learning_rate": 4.9394316274824284e-05, "loss": 0.6296, "step": 535 }, { "epoch": 0.71, "grad_norm": 3.279911518096924, "learning_rate": 4.938298919762907e-05, "loss": 0.6206, "step": 540 }, { "epoch": 0.71, "grad_norm": 3.3684518337249756, "learning_rate": 4.9371558506871893e-05, "loss": 0.618, "step": 545 }, { "epoch": 0.72, "grad_norm": 3.6144015789031982, "learning_rate": 4.936002425112657e-05, "loss": 0.6063, "step": 550 }, { "epoch": 0.73, "grad_norm": 3.5420987606048584, "learning_rate": 4.934838647940699e-05, "loss": 0.6417, "step": 555 }, { "epoch": 0.73, "grad_norm": 3.549086570739746, "learning_rate": 4.933664524116694e-05, "loss": 0.6196, "step": 560 }, { "epoch": 0.74, "grad_norm": 3.43263840675354, "learning_rate": 4.9324800586299854e-05, "loss": 0.6224, "step": 565 }, { "epoch": 0.75, "grad_norm": 3.745143175125122, "learning_rate": 4.931285256513868e-05, "loss": 0.6052, "step": 570 }, { "epoch": 0.75, "grad_norm": 3.4879541397094727, "learning_rate": 4.9300801228455536e-05, "loss": 0.6168, "step": 575 }, { "epoch": 0.76, "grad_norm": 3.449970245361328, "learning_rate": 4.9288646627461645e-05, "loss": 0.6278, "step": 580 }, { "epoch": 0.77, "grad_norm": 3.4467875957489014, "learning_rate": 4.9276388813807e-05, "loss": 0.5972, "step": 585 }, { "epoch": 0.77, "grad_norm": 3.513766288757324, "learning_rate": 4.92640278395802e-05, "loss": 0.6289, "step": 590 }, { "epoch": 0.78, "grad_norm": 3.6418793201446533, "learning_rate": 4.925156375730822e-05, "loss": 0.6228, "step": 595 }, { "epoch": 0.79, "grad_norm": 3.486773729324341, "learning_rate": 4.923899661995617e-05, "loss": 0.5931, "step": 600 }, { "epoch": 0.79, "grad_norm": 3.476663112640381, "learning_rate": 4.92263264809271e-05, "loss": 0.615, "step": 605 }, { "epoch": 0.8, "grad_norm": 3.6010472774505615, "learning_rate": 4.9213553394061754e-05, "loss": 0.6221, "step": 610 }, { "epoch": 0.81, "grad_norm": 3.489880323410034, "learning_rate": 4.920067741363835e-05, "loss": 0.6008, "step": 615 }, { "epoch": 0.81, "grad_norm": 3.6153078079223633, "learning_rate": 4.918769859437232e-05, "loss": 0.6362, "step": 620 }, { "epoch": 0.82, "grad_norm": 3.6381516456604004, "learning_rate": 4.9174616991416136e-05, "loss": 0.6391, "step": 625 }, { "epoch": 0.83, "grad_norm": 3.5741944313049316, "learning_rate": 4.916143266035901e-05, "loss": 0.617, "step": 630 }, { "epoch": 0.83, "grad_norm": 3.613692283630371, "learning_rate": 4.914814565722671e-05, "loss": 0.628, "step": 635 }, { "epoch": 0.84, "grad_norm": 3.7904045581817627, "learning_rate": 4.913475603848129e-05, "loss": 0.6157, "step": 640 }, { "epoch": 0.85, "grad_norm": 3.4816792011260986, "learning_rate": 4.912126386102086e-05, "loss": 0.6457, "step": 645 }, { "epoch": 0.85, "grad_norm": 3.371680974960327, "learning_rate": 4.910766918217935e-05, "loss": 0.6304, "step": 650 }, { "epoch": 0.86, "grad_norm": 3.502263069152832, "learning_rate": 4.909397205972627e-05, "loss": 0.6057, "step": 655 }, { "epoch": 0.87, "grad_norm": 3.6000306606292725, "learning_rate": 4.908017255186643e-05, "loss": 0.6629, "step": 660 }, { "epoch": 0.87, "grad_norm": 3.747457265853882, "learning_rate": 4.906627071723975e-05, "loss": 0.659, "step": 665 }, { "epoch": 0.88, "grad_norm": 3.356635808944702, "learning_rate": 4.905226661492095e-05, "loss": 0.6263, "step": 670 }, { "epoch": 0.88, "grad_norm": 3.6806087493896484, "learning_rate": 4.903816030441935e-05, "loss": 0.6128, "step": 675 }, { "epoch": 0.89, "grad_norm": 3.5806305408477783, "learning_rate": 4.902395184567859e-05, "loss": 0.6538, "step": 680 }, { "epoch": 0.9, "grad_norm": 3.4562089443206787, "learning_rate": 4.900964129907638e-05, "loss": 0.6271, "step": 685 }, { "epoch": 0.9, "grad_norm": 3.561217784881592, "learning_rate": 4.8995228725424235e-05, "loss": 0.6683, "step": 690 }, { "epoch": 0.91, "grad_norm": 3.419334650039673, "learning_rate": 4.898071418596724e-05, "loss": 0.6503, "step": 695 }, { "epoch": 0.92, "grad_norm": 3.7222039699554443, "learning_rate": 4.8966097742383765e-05, "loss": 0.6211, "step": 700 }, { "epoch": 0.92, "grad_norm": 3.767538070678711, "learning_rate": 4.895137945678522e-05, "loss": 0.6252, "step": 705 }, { "epoch": 0.93, "grad_norm": 3.1880886554718018, "learning_rate": 4.893655939171578e-05, "loss": 0.6403, "step": 710 }, { "epoch": 0.94, "grad_norm": 3.655524492263794, "learning_rate": 4.892163761015214e-05, "loss": 0.6344, "step": 715 }, { "epoch": 0.94, "grad_norm": 3.342782735824585, "learning_rate": 4.890661417550319e-05, "loss": 0.6339, "step": 720 }, { "epoch": 0.95, "grad_norm": 3.661858320236206, "learning_rate": 4.889148915160984e-05, "loss": 0.6554, "step": 725 }, { "epoch": 0.96, "grad_norm": 3.906249761581421, "learning_rate": 4.887626260274465e-05, "loss": 0.6478, "step": 730 }, { "epoch": 0.96, "grad_norm": 3.5664069652557373, "learning_rate": 4.886093459361163e-05, "loss": 0.652, "step": 735 }, { "epoch": 0.97, "grad_norm": 3.413325786590576, "learning_rate": 4.8845505189345934e-05, "loss": 0.6491, "step": 740 }, { "epoch": 0.98, "grad_norm": 3.2417361736297607, "learning_rate": 4.8829974455513564e-05, "loss": 0.6344, "step": 745 }, { "epoch": 0.98, "grad_norm": 4.055202960968018, "learning_rate": 4.881434245811115e-05, "loss": 0.6458, "step": 750 }, { "epoch": 0.99, "grad_norm": 3.9207661151885986, "learning_rate": 4.87986092635656e-05, "loss": 0.6493, "step": 755 }, { "epoch": 1.0, "grad_norm": 3.7753255367279053, "learning_rate": 4.878277493873388e-05, "loss": 0.6141, "step": 760 }, { "epoch": 1.0, "grad_norm": 2.760272979736328, "learning_rate": 4.876683955090267e-05, "loss": 0.4732, "step": 765 }, { "epoch": 1.01, "grad_norm": 2.363640069961548, "learning_rate": 4.8750803167788136e-05, "loss": 0.2479, "step": 770 }, { "epoch": 1.02, "grad_norm": 2.5096426010131836, "learning_rate": 4.87346658575356e-05, "loss": 0.2297, "step": 775 }, { "epoch": 1.02, "grad_norm": 2.936283826828003, "learning_rate": 4.871842768871928e-05, "loss": 0.231, "step": 780 }, { "epoch": 1.03, "grad_norm": 2.782768964767456, "learning_rate": 4.8702088730341965e-05, "loss": 0.2195, "step": 785 }, { "epoch": 1.04, "grad_norm": 2.589066982269287, "learning_rate": 4.868564905183476e-05, "loss": 0.2205, "step": 790 }, { "epoch": 1.04, "grad_norm": 2.7772037982940674, "learning_rate": 4.866910872305675e-05, "loss": 0.2144, "step": 795 }, { "epoch": 1.05, "grad_norm": 2.887826919555664, "learning_rate": 4.865246781429476e-05, "loss": 0.2244, "step": 800 }, { "epoch": 1.06, "grad_norm": 2.558298349380493, "learning_rate": 4.8635726396262996e-05, "loss": 0.2422, "step": 805 }, { "epoch": 1.06, "grad_norm": 2.575716972351074, "learning_rate": 4.861888454010275e-05, "loss": 0.2223, "step": 810 }, { "epoch": 1.07, "grad_norm": 2.8698527812957764, "learning_rate": 4.860194231738216e-05, "loss": 0.2164, "step": 815 }, { "epoch": 1.07, "grad_norm": 2.723947763442993, "learning_rate": 4.8584899800095864e-05, "loss": 0.2332, "step": 820 }, { "epoch": 1.08, "grad_norm": 2.9089255332946777, "learning_rate": 4.8567757060664644e-05, "loss": 0.2419, "step": 825 }, { "epoch": 1.09, "grad_norm": 2.4928860664367676, "learning_rate": 4.8550514171935214e-05, "loss": 0.2268, "step": 830 }, { "epoch": 1.09, "grad_norm": 2.4811110496520996, "learning_rate": 4.853317120717985e-05, "loss": 0.2137, "step": 835 }, { "epoch": 1.1, "grad_norm": 3.3581888675689697, "learning_rate": 4.85157282400961e-05, "loss": 0.2416, "step": 840 }, { "epoch": 1.11, "grad_norm": 2.5201902389526367, "learning_rate": 4.849818534480645e-05, "loss": 0.2263, "step": 845 }, { "epoch": 1.11, "grad_norm": 2.51816463470459, "learning_rate": 4.8480542595858025e-05, "loss": 0.2346, "step": 850 }, { "epoch": 1.12, "grad_norm": 2.437803030014038, "learning_rate": 4.846280006822228e-05, "loss": 0.2311, "step": 855 }, { "epoch": 1.13, "grad_norm": 2.6363041400909424, "learning_rate": 4.844495783729467e-05, "loss": 0.2364, "step": 860 }, { "epoch": 1.13, "grad_norm": 2.600130796432495, "learning_rate": 4.842701597889432e-05, "loss": 0.2292, "step": 865 }, { "epoch": 1.14, "grad_norm": 2.5062355995178223, "learning_rate": 4.840897456926373e-05, "loss": 0.253, "step": 870 }, { "epoch": 1.15, "grad_norm": 2.7811343669891357, "learning_rate": 4.8390833685068424e-05, "loss": 0.2347, "step": 875 }, { "epoch": 1.15, "grad_norm": 2.58297061920166, "learning_rate": 4.837259340339665e-05, "loss": 0.2313, "step": 880 }, { "epoch": 1.16, "grad_norm": 2.850160598754883, "learning_rate": 4.8354253801759e-05, "loss": 0.2433, "step": 885 }, { "epoch": 1.17, "grad_norm": 2.6711654663085938, "learning_rate": 4.8335814958088166e-05, "loss": 0.2384, "step": 890 }, { "epoch": 1.17, "grad_norm": 2.596914291381836, "learning_rate": 4.8317276950738525e-05, "loss": 0.2411, "step": 895 }, { "epoch": 1.18, "grad_norm": 2.9930615425109863, "learning_rate": 4.829863985848587e-05, "loss": 0.2381, "step": 900 }, { "epoch": 1.19, "grad_norm": 2.8596436977386475, "learning_rate": 4.827990376052702e-05, "loss": 0.2409, "step": 905 }, { "epoch": 1.19, "grad_norm": 2.607304573059082, "learning_rate": 4.826106873647953e-05, "loss": 0.2387, "step": 910 }, { "epoch": 1.2, "grad_norm": 2.747140645980835, "learning_rate": 4.824213486638133e-05, "loss": 0.2552, "step": 915 }, { "epoch": 1.21, "grad_norm": 2.7441892623901367, "learning_rate": 4.822310223069039e-05, "loss": 0.2414, "step": 920 }, { "epoch": 1.21, "grad_norm": 2.480534791946411, "learning_rate": 4.820397091028436e-05, "loss": 0.2451, "step": 925 }, { "epoch": 1.22, "grad_norm": 2.65727162361145, "learning_rate": 4.818474098646026e-05, "loss": 0.2271, "step": 930 }, { "epoch": 1.23, "grad_norm": 2.657895088195801, "learning_rate": 4.8165412540934116e-05, "loss": 0.2463, "step": 935 }, { "epoch": 1.23, "grad_norm": 2.8118693828582764, "learning_rate": 4.814598565584062e-05, "loss": 0.2586, "step": 940 }, { "epoch": 1.24, "grad_norm": 2.7916576862335205, "learning_rate": 4.812646041373275e-05, "loss": 0.2487, "step": 945 }, { "epoch": 1.25, "grad_norm": 2.8312575817108154, "learning_rate": 4.810683689758147e-05, "loss": 0.2448, "step": 950 }, { "epoch": 1.25, "grad_norm": 2.789888620376587, "learning_rate": 4.808711519077534e-05, "loss": 0.25, "step": 955 }, { "epoch": 1.26, "grad_norm": 2.660008192062378, "learning_rate": 4.806729537712017e-05, "loss": 0.2592, "step": 960 }, { "epoch": 1.26, "grad_norm": 2.80081844329834, "learning_rate": 4.8047377540838676e-05, "loss": 0.2633, "step": 965 }, { "epoch": 1.27, "grad_norm": 2.5701184272766113, "learning_rate": 4.8027361766570117e-05, "loss": 0.2345, "step": 970 }, { "epoch": 1.28, "grad_norm": 2.6467089653015137, "learning_rate": 4.8007248139369915e-05, "loss": 0.2421, "step": 975 }, { "epoch": 1.28, "grad_norm": 2.8026981353759766, "learning_rate": 4.7987036744709326e-05, "loss": 0.2462, "step": 980 }, { "epoch": 1.29, "grad_norm": 2.9150643348693848, "learning_rate": 4.7966727668475044e-05, "loss": 0.2516, "step": 985 }, { "epoch": 1.3, "grad_norm": 2.872527837753296, "learning_rate": 4.794632099696888e-05, "loss": 0.2581, "step": 990 }, { "epoch": 1.3, "grad_norm": 2.764134168624878, "learning_rate": 4.792581681690734e-05, "loss": 0.2707, "step": 995 }, { "epoch": 1.31, "grad_norm": 2.886357069015503, "learning_rate": 4.790521521542129e-05, "loss": 0.2573, "step": 1000 }, { "epoch": 1.32, "grad_norm": 2.990485429763794, "learning_rate": 4.788451628005561e-05, "loss": 0.2634, "step": 1005 }, { "epoch": 1.32, "grad_norm": 2.758971691131592, "learning_rate": 4.786372009876876e-05, "loss": 0.2439, "step": 1010 }, { "epoch": 1.33, "grad_norm": 2.70831561088562, "learning_rate": 4.784282675993245e-05, "loss": 0.241, "step": 1015 }, { "epoch": 1.34, "grad_norm": 2.6341211795806885, "learning_rate": 4.782183635233124e-05, "loss": 0.2652, "step": 1020 }, { "epoch": 1.34, "grad_norm": 2.7551965713500977, "learning_rate": 4.780074896516219e-05, "loss": 0.244, "step": 1025 }, { "epoch": 1.35, "grad_norm": 3.252516508102417, "learning_rate": 4.7779564688034476e-05, "loss": 0.2594, "step": 1030 }, { "epoch": 1.36, "grad_norm": 2.93808913230896, "learning_rate": 4.7758283610968985e-05, "loss": 0.2594, "step": 1035 }, { "epoch": 1.36, "grad_norm": 2.767031192779541, "learning_rate": 4.773690582439795e-05, "loss": 0.2506, "step": 1040 }, { "epoch": 1.37, "grad_norm": 2.6166746616363525, "learning_rate": 4.7715431419164566e-05, "loss": 0.2624, "step": 1045 }, { "epoch": 1.38, "grad_norm": 2.9592745304107666, "learning_rate": 4.7693860486522604e-05, "loss": 0.2735, "step": 1050 }, { "epoch": 1.38, "grad_norm": 2.8421945571899414, "learning_rate": 4.7672193118136e-05, "loss": 0.2693, "step": 1055 }, { "epoch": 1.39, "grad_norm": 3.0941479206085205, "learning_rate": 4.7650429406078525e-05, "loss": 0.2563, "step": 1060 }, { "epoch": 1.4, "grad_norm": 2.8086464405059814, "learning_rate": 4.762856944283331e-05, "loss": 0.2627, "step": 1065 }, { "epoch": 1.4, "grad_norm": 2.981468439102173, "learning_rate": 4.760661332129254e-05, "loss": 0.2739, "step": 1070 }, { "epoch": 1.41, "grad_norm": 2.7119858264923096, "learning_rate": 4.758456113475699e-05, "loss": 0.2697, "step": 1075 }, { "epoch": 1.42, "grad_norm": 2.955040454864502, "learning_rate": 4.756241297693566e-05, "loss": 0.2713, "step": 1080 }, { "epoch": 1.42, "grad_norm": 2.78459095954895, "learning_rate": 4.7540168941945376e-05, "loss": 0.2659, "step": 1085 }, { "epoch": 1.43, "grad_norm": 2.754824161529541, "learning_rate": 4.751782912431038e-05, "loss": 0.2527, "step": 1090 }, { "epoch": 1.44, "grad_norm": 2.916003465652466, "learning_rate": 4.749539361896195e-05, "loss": 0.2554, "step": 1095 }, { "epoch": 1.44, "grad_norm": 2.9990346431732178, "learning_rate": 4.747286252123797e-05, "loss": 0.2449, "step": 1100 }, { "epoch": 1.45, "grad_norm": 2.68816876411438, "learning_rate": 4.7450235926882524e-05, "loss": 0.2539, "step": 1105 }, { "epoch": 1.46, "grad_norm": 2.7783591747283936, "learning_rate": 4.742751393204553e-05, "loss": 0.2673, "step": 1110 }, { "epoch": 1.46, "grad_norm": 3.041889190673828, "learning_rate": 4.740469663328228e-05, "loss": 0.2692, "step": 1115 }, { "epoch": 1.47, "grad_norm": 3.2789931297302246, "learning_rate": 4.738178412755306e-05, "loss": 0.2691, "step": 1120 }, { "epoch": 1.47, "grad_norm": 2.8584647178649902, "learning_rate": 4.7358776512222737e-05, "loss": 0.2722, "step": 1125 }, { "epoch": 1.48, "grad_norm": 2.982015371322632, "learning_rate": 4.7335673885060316e-05, "loss": 0.2721, "step": 1130 }, { "epoch": 1.49, "grad_norm": 2.9325811862945557, "learning_rate": 4.731247634423858e-05, "loss": 0.2791, "step": 1135 }, { "epoch": 1.49, "grad_norm": 2.9873268604278564, "learning_rate": 4.728918398833361e-05, "loss": 0.2805, "step": 1140 }, { "epoch": 1.5, "grad_norm": 2.8286678791046143, "learning_rate": 4.726579691632442e-05, "loss": 0.2628, "step": 1145 }, { "epoch": 1.51, "grad_norm": 2.6870853900909424, "learning_rate": 4.7242315227592496e-05, "loss": 0.2697, "step": 1150 }, { "epoch": 1.51, "grad_norm": 2.881246566772461, "learning_rate": 4.721873902192139e-05, "loss": 0.2786, "step": 1155 }, { "epoch": 1.52, "grad_norm": 2.676746129989624, "learning_rate": 4.719506839949631e-05, "loss": 0.2795, "step": 1160 }, { "epoch": 1.53, "grad_norm": 2.8064475059509277, "learning_rate": 4.717130346090368e-05, "loss": 0.2729, "step": 1165 }, { "epoch": 1.53, "grad_norm": 2.7660868167877197, "learning_rate": 4.7147444307130686e-05, "loss": 0.2752, "step": 1170 }, { "epoch": 1.54, "grad_norm": 2.8748722076416016, "learning_rate": 4.71234910395649e-05, "loss": 0.2772, "step": 1175 }, { "epoch": 1.55, "grad_norm": 2.691197633743286, "learning_rate": 4.7099443759993837e-05, "loss": 0.256, "step": 1180 }, { "epoch": 1.55, "grad_norm": 2.8552544116973877, "learning_rate": 4.707530257060445e-05, "loss": 0.2758, "step": 1185 }, { "epoch": 1.56, "grad_norm": 2.7499427795410156, "learning_rate": 4.705106757398282e-05, "loss": 0.2628, "step": 1190 }, { "epoch": 1.57, "grad_norm": 2.6907596588134766, "learning_rate": 4.702673887311362e-05, "loss": 0.2662, "step": 1195 }, { "epoch": 1.57, "grad_norm": 2.7225170135498047, "learning_rate": 4.7002316571379715e-05, "loss": 0.2709, "step": 1200 }, { "epoch": 1.58, "grad_norm": 3.2904715538024902, "learning_rate": 4.697780077256172e-05, "loss": 0.2853, "step": 1205 }, { "epoch": 1.59, "grad_norm": 2.7764620780944824, "learning_rate": 4.695319158083756e-05, "loss": 0.2623, "step": 1210 }, { "epoch": 1.59, "grad_norm": 3.36917781829834, "learning_rate": 4.6928489100782046e-05, "loss": 0.2806, "step": 1215 }, { "epoch": 1.6, "grad_norm": 3.3074262142181396, "learning_rate": 4.690369343736636e-05, "loss": 0.2834, "step": 1220 }, { "epoch": 1.61, "grad_norm": 2.958819627761841, "learning_rate": 4.6878804695957716e-05, "loss": 0.2787, "step": 1225 }, { "epoch": 1.61, "grad_norm": 2.8270795345306396, "learning_rate": 4.6853822982318816e-05, "loss": 0.2737, "step": 1230 }, { "epoch": 1.62, "grad_norm": 2.6642744541168213, "learning_rate": 4.682874840260746e-05, "loss": 0.2872, "step": 1235 }, { "epoch": 1.63, "grad_norm": 3.0754623413085938, "learning_rate": 4.680358106337607e-05, "loss": 0.2674, "step": 1240 }, { "epoch": 1.63, "grad_norm": 3.076148271560669, "learning_rate": 4.6778321071571224e-05, "loss": 0.2769, "step": 1245 }, { "epoch": 1.64, "grad_norm": 2.8592352867126465, "learning_rate": 4.675296853453326e-05, "loss": 0.2799, "step": 1250 }, { "epoch": 1.65, "grad_norm": 3.153860330581665, "learning_rate": 4.6727523559995734e-05, "loss": 0.2812, "step": 1255 }, { "epoch": 1.65, "grad_norm": 3.1477208137512207, "learning_rate": 4.6701986256085046e-05, "loss": 0.2818, "step": 1260 }, { "epoch": 1.66, "grad_norm": 3.040626049041748, "learning_rate": 4.667635673131992e-05, "loss": 0.2832, "step": 1265 }, { "epoch": 1.66, "grad_norm": 3.204580307006836, "learning_rate": 4.665063509461097e-05, "loss": 0.3009, "step": 1270 }, { "epoch": 1.67, "grad_norm": 2.8025059700012207, "learning_rate": 4.662482145526024e-05, "loss": 0.2776, "step": 1275 }, { "epoch": 1.68, "grad_norm": 3.0659685134887695, "learning_rate": 4.659891592296071e-05, "loss": 0.291, "step": 1280 }, { "epoch": 1.68, "grad_norm": 2.9462106227874756, "learning_rate": 4.6572918607795876e-05, "loss": 0.287, "step": 1285 }, { "epoch": 1.69, "grad_norm": 3.0103273391723633, "learning_rate": 4.6546829620239265e-05, "loss": 0.3025, "step": 1290 }, { "epoch": 1.7, "grad_norm": 2.912851095199585, "learning_rate": 4.6520649071153916e-05, "loss": 0.2675, "step": 1295 }, { "epoch": 1.7, "grad_norm": 3.1437137126922607, "learning_rate": 4.6494377071791996e-05, "loss": 0.2896, "step": 1300 }, { "epoch": 1.71, "grad_norm": 2.8913474082946777, "learning_rate": 4.646801373379425e-05, "loss": 0.3142, "step": 1305 }, { "epoch": 1.72, "grad_norm": 3.0581839084625244, "learning_rate": 4.644155916918959e-05, "loss": 0.293, "step": 1310 }, { "epoch": 1.72, "grad_norm": 2.8686771392822266, "learning_rate": 4.641501349039456e-05, "loss": 0.273, "step": 1315 }, { "epoch": 1.73, "grad_norm": 2.914700984954834, "learning_rate": 4.6388376810212905e-05, "loss": 0.2837, "step": 1320 }, { "epoch": 1.74, "grad_norm": 3.2269139289855957, "learning_rate": 4.6361649241835056e-05, "loss": 0.2849, "step": 1325 }, { "epoch": 1.74, "grad_norm": 3.0138943195343018, "learning_rate": 4.633483089883769e-05, "loss": 0.2854, "step": 1330 }, { "epoch": 1.75, "grad_norm": 3.2977559566497803, "learning_rate": 4.63079218951832e-05, "loss": 0.2922, "step": 1335 }, { "epoch": 1.76, "grad_norm": 3.0085713863372803, "learning_rate": 4.6280922345219255e-05, "loss": 0.2838, "step": 1340 }, { "epoch": 1.76, "grad_norm": 3.183983087539673, "learning_rate": 4.625383236367827e-05, "loss": 0.282, "step": 1345 }, { "epoch": 1.77, "grad_norm": 2.8702237606048584, "learning_rate": 4.6226652065676974e-05, "loss": 0.2786, "step": 1350 }, { "epoch": 1.78, "grad_norm": 3.479321241378784, "learning_rate": 4.619938156671584e-05, "loss": 0.2904, "step": 1355 }, { "epoch": 1.78, "grad_norm": 2.9285452365875244, "learning_rate": 4.61720209826787e-05, "loss": 0.2861, "step": 1360 }, { "epoch": 1.79, "grad_norm": 3.244591236114502, "learning_rate": 4.6144570429832144e-05, "loss": 0.2928, "step": 1365 }, { "epoch": 1.8, "grad_norm": 2.8110570907592773, "learning_rate": 4.6117030024825114e-05, "loss": 0.2904, "step": 1370 }, { "epoch": 1.8, "grad_norm": 3.049492359161377, "learning_rate": 4.6089399884688356e-05, "loss": 0.2739, "step": 1375 }, { "epoch": 1.81, "grad_norm": 2.960361957550049, "learning_rate": 4.606168012683394e-05, "loss": 0.3031, "step": 1380 }, { "epoch": 1.82, "grad_norm": 3.257373571395874, "learning_rate": 4.603387086905475e-05, "loss": 0.2993, "step": 1385 }, { "epoch": 1.82, "grad_norm": 3.0115904808044434, "learning_rate": 4.600597222952402e-05, "loss": 0.2915, "step": 1390 }, { "epoch": 1.83, "grad_norm": 3.111074209213257, "learning_rate": 4.597798432679477e-05, "loss": 0.2948, "step": 1395 }, { "epoch": 1.84, "grad_norm": 3.1926794052124023, "learning_rate": 4.594990727979937e-05, "loss": 0.2971, "step": 1400 }, { "epoch": 1.84, "grad_norm": 2.913715362548828, "learning_rate": 4.5921741207848966e-05, "loss": 0.2844, "step": 1405 }, { "epoch": 1.85, "grad_norm": 2.8652007579803467, "learning_rate": 4.5893486230633037e-05, "loss": 0.2687, "step": 1410 }, { "epoch": 1.85, "grad_norm": 2.927306890487671, "learning_rate": 4.586514246821885e-05, "loss": 0.2984, "step": 1415 }, { "epoch": 1.86, "grad_norm": 3.2218594551086426, "learning_rate": 4.583671004105096e-05, "loss": 0.2928, "step": 1420 }, { "epoch": 1.87, "grad_norm": 3.1091806888580322, "learning_rate": 4.580818906995068e-05, "loss": 0.3024, "step": 1425 }, { "epoch": 1.87, "grad_norm": 3.152013063430786, "learning_rate": 4.5779579676115604e-05, "loss": 0.2898, "step": 1430 }, { "epoch": 1.88, "grad_norm": 3.037785053253174, "learning_rate": 4.575088198111905e-05, "loss": 0.3012, "step": 1435 }, { "epoch": 1.89, "grad_norm": 3.125337600708008, "learning_rate": 4.5722096106909595e-05, "loss": 0.2982, "step": 1440 }, { "epoch": 1.89, "grad_norm": 3.1015219688415527, "learning_rate": 4.56932221758105e-05, "loss": 0.3014, "step": 1445 }, { "epoch": 1.9, "grad_norm": 3.0641446113586426, "learning_rate": 4.566426031051922e-05, "loss": 0.3057, "step": 1450 }, { "epoch": 1.91, "grad_norm": 3.1846718788146973, "learning_rate": 4.56352106341069e-05, "loss": 0.2941, "step": 1455 }, { "epoch": 1.91, "grad_norm": 2.9871373176574707, "learning_rate": 4.56060732700178e-05, "loss": 0.2902, "step": 1460 }, { "epoch": 1.92, "grad_norm": 2.941716194152832, "learning_rate": 4.5576848342068826e-05, "loss": 0.2999, "step": 1465 }, { "epoch": 1.93, "grad_norm": 2.8153445720672607, "learning_rate": 4.554753597444896e-05, "loss": 0.2855, "step": 1470 }, { "epoch": 1.93, "grad_norm": 3.2046408653259277, "learning_rate": 4.551813629171878e-05, "loss": 0.3167, "step": 1475 }, { "epoch": 1.94, "grad_norm": 3.2123496532440186, "learning_rate": 4.548864941880988e-05, "loss": 0.2929, "step": 1480 }, { "epoch": 1.95, "grad_norm": 2.81064772605896, "learning_rate": 4.545907548102436e-05, "loss": 0.3059, "step": 1485 }, { "epoch": 1.95, "grad_norm": 3.07346248626709, "learning_rate": 4.5429414604034307e-05, "loss": 0.2902, "step": 1490 }, { "epoch": 1.96, "grad_norm": 2.8002560138702393, "learning_rate": 4.539966691388125e-05, "loss": 0.2918, "step": 1495 }, { "epoch": 1.97, "grad_norm": 3.3515923023223877, "learning_rate": 4.536983253697561e-05, "loss": 0.304, "step": 1500 }, { "epoch": 1.97, "grad_norm": 3.050218105316162, "learning_rate": 4.53399116000962e-05, "loss": 0.3163, "step": 1505 }, { "epoch": 1.98, "grad_norm": 3.1914007663726807, "learning_rate": 4.530990423038962e-05, "loss": 0.3071, "step": 1510 }, { "epoch": 1.99, "grad_norm": 3.180460214614868, "learning_rate": 4.527981055536982e-05, "loss": 0.3023, "step": 1515 }, { "epoch": 1.99, "grad_norm": 3.2100706100463867, "learning_rate": 4.524963070291744e-05, "loss": 0.3219, "step": 1520 }, { "epoch": 2.0, "grad_norm": 2.9520275592803955, "learning_rate": 4.5219364801279356e-05, "loss": 0.2968, "step": 1525 }, { "epoch": 2.01, "grad_norm": 2.4291439056396484, "learning_rate": 4.51890129790681e-05, "loss": 0.17, "step": 1530 }, { "epoch": 2.01, "grad_norm": 1.9606090784072876, "learning_rate": 4.5158575365261305e-05, "loss": 0.1316, "step": 1535 }, { "epoch": 2.02, "grad_norm": 2.126908779144287, "learning_rate": 4.512805208920118e-05, "loss": 0.1281, "step": 1540 }, { "epoch": 2.03, "grad_norm": 2.0146312713623047, "learning_rate": 4.509744328059395e-05, "loss": 0.1234, "step": 1545 }, { "epoch": 2.03, "grad_norm": 1.9698853492736816, "learning_rate": 4.506674906950929e-05, "loss": 0.1341, "step": 1550 }, { "epoch": 2.04, "grad_norm": 2.1764025688171387, "learning_rate": 4.5035969586379804e-05, "loss": 0.1331, "step": 1555 }, { "epoch": 2.04, "grad_norm": 2.2242555618286133, "learning_rate": 4.5005104962000436e-05, "loss": 0.1325, "step": 1560 }, { "epoch": 2.05, "grad_norm": 2.019362449645996, "learning_rate": 4.4974155327527926e-05, "loss": 0.1219, "step": 1565 }, { "epoch": 2.06, "grad_norm": 2.3239810466766357, "learning_rate": 4.494312081448029e-05, "loss": 0.1304, "step": 1570 }, { "epoch": 2.06, "grad_norm": 2.2973790168762207, "learning_rate": 4.4912001554736205e-05, "loss": 0.1316, "step": 1575 }, { "epoch": 2.07, "grad_norm": 2.4513959884643555, "learning_rate": 4.488079768053447e-05, "loss": 0.133, "step": 1580 }, { "epoch": 2.08, "grad_norm": 2.789614200592041, "learning_rate": 4.484950932447345e-05, "loss": 0.1378, "step": 1585 }, { "epoch": 2.08, "grad_norm": 2.2913756370544434, "learning_rate": 4.481813661951052e-05, "loss": 0.1287, "step": 1590 }, { "epoch": 2.09, "grad_norm": 2.1334588527679443, "learning_rate": 4.4786679698961476e-05, "loss": 0.1304, "step": 1595 }, { "epoch": 2.1, "grad_norm": 2.3002805709838867, "learning_rate": 4.475513869649998e-05, "loss": 0.134, "step": 1600 }, { "epoch": 2.1, "grad_norm": 2.2173187732696533, "learning_rate": 4.4723513746157004e-05, "loss": 0.1359, "step": 1605 }, { "epoch": 2.11, "grad_norm": 1.9922655820846558, "learning_rate": 4.469180498232024e-05, "loss": 0.1403, "step": 1610 }, { "epoch": 2.12, "grad_norm": 2.208549737930298, "learning_rate": 4.466001253973355e-05, "loss": 0.1316, "step": 1615 }, { "epoch": 2.12, "grad_norm": 2.4228994846343994, "learning_rate": 4.4628136553496375e-05, "loss": 0.1336, "step": 1620 }, { "epoch": 2.13, "grad_norm": 2.2046756744384766, "learning_rate": 4.459617715906316e-05, "loss": 0.1389, "step": 1625 }, { "epoch": 2.14, "grad_norm": 2.3668532371520996, "learning_rate": 4.4564134492242805e-05, "loss": 0.1374, "step": 1630 }, { "epoch": 2.14, "grad_norm": 2.3358521461486816, "learning_rate": 4.4532008689198056e-05, "loss": 0.1339, "step": 1635 }, { "epoch": 2.15, "grad_norm": 2.4201912879943848, "learning_rate": 4.449979988644494e-05, "loss": 0.1324, "step": 1640 }, { "epoch": 2.16, "grad_norm": 2.356771230697632, "learning_rate": 4.446750822085218e-05, "loss": 0.1496, "step": 1645 }, { "epoch": 2.16, "grad_norm": 2.5749542713165283, "learning_rate": 4.4435133829640645e-05, "loss": 0.1446, "step": 1650 }, { "epoch": 2.17, "grad_norm": 2.313682794570923, "learning_rate": 4.440267685038271e-05, "loss": 0.1417, "step": 1655 }, { "epoch": 2.18, "grad_norm": 2.3327279090881348, "learning_rate": 4.437013742100171e-05, "loss": 0.1341, "step": 1660 }, { "epoch": 2.18, "grad_norm": 2.482767105102539, "learning_rate": 4.4337515679771345e-05, "loss": 0.1402, "step": 1665 }, { "epoch": 2.19, "grad_norm": 2.6034271717071533, "learning_rate": 4.4304811765315105e-05, "loss": 0.1498, "step": 1670 }, { "epoch": 2.2, "grad_norm": 2.2677841186523438, "learning_rate": 4.427202581660565e-05, "loss": 0.1414, "step": 1675 }, { "epoch": 2.2, "grad_norm": 2.3339622020721436, "learning_rate": 4.423915797296425e-05, "loss": 0.1377, "step": 1680 }, { "epoch": 2.21, "grad_norm": 2.1083145141601562, "learning_rate": 4.420620837406018e-05, "loss": 0.1416, "step": 1685 }, { "epoch": 2.22, "grad_norm": 2.400583267211914, "learning_rate": 4.4173177159910106e-05, "loss": 0.1383, "step": 1690 }, { "epoch": 2.22, "grad_norm": 2.1524839401245117, "learning_rate": 4.414006447087755e-05, "loss": 0.1366, "step": 1695 }, { "epoch": 2.23, "grad_norm": 2.1756019592285156, "learning_rate": 4.410687044767223e-05, "loss": 0.1402, "step": 1700 }, { "epoch": 2.23, "grad_norm": 2.5507566928863525, "learning_rate": 4.407359523134949e-05, "loss": 0.1514, "step": 1705 }, { "epoch": 2.24, "grad_norm": 2.152941942214966, "learning_rate": 4.4040238963309696e-05, "loss": 0.1451, "step": 1710 }, { "epoch": 2.25, "grad_norm": 2.3613879680633545, "learning_rate": 4.400680178529765e-05, "loss": 0.1407, "step": 1715 }, { "epoch": 2.25, "grad_norm": 2.624096393585205, "learning_rate": 4.397328383940196e-05, "loss": 0.1428, "step": 1720 }, { "epoch": 2.26, "grad_norm": 2.358207941055298, "learning_rate": 4.393968526805447e-05, "loss": 0.1443, "step": 1725 }, { "epoch": 2.27, "grad_norm": 2.758371353149414, "learning_rate": 4.3906006214029585e-05, "loss": 0.1568, "step": 1730 }, { "epoch": 2.27, "grad_norm": 2.099876642227173, "learning_rate": 4.387224682044378e-05, "loss": 0.157, "step": 1735 }, { "epoch": 2.28, "grad_norm": 2.3019683361053467, "learning_rate": 4.3838407230754885e-05, "loss": 0.1404, "step": 1740 }, { "epoch": 2.29, "grad_norm": 2.589655637741089, "learning_rate": 4.3804487588761544e-05, "loss": 0.156, "step": 1745 }, { "epoch": 2.29, "grad_norm": 2.4104435443878174, "learning_rate": 4.3770488038602555e-05, "loss": 0.1467, "step": 1750 }, { "epoch": 2.3, "grad_norm": 2.6529500484466553, "learning_rate": 4.373640872475627e-05, "loss": 0.1475, "step": 1755 }, { "epoch": 2.31, "grad_norm": 2.272524833679199, "learning_rate": 4.370224979204003e-05, "loss": 0.1423, "step": 1760 }, { "epoch": 2.31, "grad_norm": 2.421292781829834, "learning_rate": 4.366801138560948e-05, "loss": 0.149, "step": 1765 }, { "epoch": 2.32, "grad_norm": 2.280380964279175, "learning_rate": 4.3633693650957976e-05, "loss": 0.1468, "step": 1770 }, { "epoch": 2.33, "grad_norm": 2.0802671909332275, "learning_rate": 4.3599296733916004e-05, "loss": 0.157, "step": 1775 }, { "epoch": 2.33, "grad_norm": 2.234787940979004, "learning_rate": 4.3564820780650496e-05, "loss": 0.1428, "step": 1780 }, { "epoch": 2.34, "grad_norm": 2.337618589401245, "learning_rate": 4.353026593766427e-05, "loss": 0.1459, "step": 1785 }, { "epoch": 2.35, "grad_norm": 2.529278516769409, "learning_rate": 4.3495632351795367e-05, "loss": 0.1617, "step": 1790 }, { "epoch": 2.35, "grad_norm": 2.2081286907196045, "learning_rate": 4.3460920170216425e-05, "loss": 0.1487, "step": 1795 }, { "epoch": 2.36, "grad_norm": 2.180853843688965, "learning_rate": 4.34261295404341e-05, "loss": 0.139, "step": 1800 }, { "epoch": 2.37, "grad_norm": 2.5588650703430176, "learning_rate": 4.339126061028837e-05, "loss": 0.1489, "step": 1805 }, { "epoch": 2.37, "grad_norm": 2.441371202468872, "learning_rate": 4.335631352795199e-05, "loss": 0.1544, "step": 1810 }, { "epoch": 2.38, "grad_norm": 2.429845094680786, "learning_rate": 4.332128844192977e-05, "loss": 0.151, "step": 1815 }, { "epoch": 2.39, "grad_norm": 2.3163018226623535, "learning_rate": 4.328618550105802e-05, "loss": 0.1521, "step": 1820 }, { "epoch": 2.39, "grad_norm": 2.4328958988189697, "learning_rate": 4.325100485450389e-05, "loss": 0.1581, "step": 1825 }, { "epoch": 2.4, "grad_norm": 2.343770980834961, "learning_rate": 4.3215746651764686e-05, "loss": 0.1544, "step": 1830 }, { "epoch": 2.41, "grad_norm": 2.4985294342041016, "learning_rate": 4.3180411042667354e-05, "loss": 0.1557, "step": 1835 }, { "epoch": 2.41, "grad_norm": 2.6652395725250244, "learning_rate": 4.314499817736773e-05, "loss": 0.1465, "step": 1840 }, { "epoch": 2.42, "grad_norm": 2.50243878364563, "learning_rate": 4.3109508206349945e-05, "loss": 0.1514, "step": 1845 }, { "epoch": 2.43, "grad_norm": 2.537421703338623, "learning_rate": 4.30739412804258e-05, "loss": 0.155, "step": 1850 }, { "epoch": 2.43, "grad_norm": 2.244147539138794, "learning_rate": 4.3038297550734096e-05, "loss": 0.15, "step": 1855 }, { "epoch": 2.44, "grad_norm": 2.4972686767578125, "learning_rate": 4.300257716874001e-05, "loss": 0.1559, "step": 1860 }, { "epoch": 2.44, "grad_norm": 2.495651960372925, "learning_rate": 4.296678028623446e-05, "loss": 0.1589, "step": 1865 }, { "epoch": 2.45, "grad_norm": 2.649902582168579, "learning_rate": 4.293090705533342e-05, "loss": 0.1528, "step": 1870 }, { "epoch": 2.46, "grad_norm": 2.281095266342163, "learning_rate": 4.2894957628477316e-05, "loss": 0.1639, "step": 1875 }, { "epoch": 2.46, "grad_norm": 2.5233304500579834, "learning_rate": 4.285893215843036e-05, "loss": 0.1528, "step": 1880 }, { "epoch": 2.47, "grad_norm": 2.6843202114105225, "learning_rate": 4.282283079827993e-05, "loss": 0.1623, "step": 1885 }, { "epoch": 2.48, "grad_norm": 2.4476354122161865, "learning_rate": 4.278665370143583e-05, "loss": 0.1562, "step": 1890 }, { "epoch": 2.48, "grad_norm": 2.337167501449585, "learning_rate": 4.2750401021629765e-05, "loss": 0.165, "step": 1895 }, { "epoch": 2.49, "grad_norm": 2.610464096069336, "learning_rate": 4.271407291291459e-05, "loss": 0.1591, "step": 1900 }, { "epoch": 2.5, "grad_norm": 2.4951589107513428, "learning_rate": 4.267766952966369e-05, "loss": 0.1587, "step": 1905 }, { "epoch": 2.5, "grad_norm": 2.0912039279937744, "learning_rate": 4.2641191026570336e-05, "loss": 0.1529, "step": 1910 }, { "epoch": 2.51, "grad_norm": 2.724330425262451, "learning_rate": 4.260463755864702e-05, "loss": 0.1693, "step": 1915 }, { "epoch": 2.52, "grad_norm": 2.3671672344207764, "learning_rate": 4.256800928122475e-05, "loss": 0.157, "step": 1920 }, { "epoch": 2.52, "grad_norm": 2.536565065383911, "learning_rate": 4.2531306349952496e-05, "loss": 0.1697, "step": 1925 }, { "epoch": 2.53, "grad_norm": 2.5092501640319824, "learning_rate": 4.2494528920796406e-05, "loss": 0.1655, "step": 1930 }, { "epoch": 2.54, "grad_norm": 2.6707546710968018, "learning_rate": 4.2457677150039224e-05, "loss": 0.1604, "step": 1935 }, { "epoch": 2.54, "grad_norm": 2.4832890033721924, "learning_rate": 4.242075119427961e-05, "loss": 0.1504, "step": 1940 }, { "epoch": 2.55, "grad_norm": 2.4126479625701904, "learning_rate": 4.238375121043145e-05, "loss": 0.1552, "step": 1945 }, { "epoch": 2.56, "grad_norm": 2.3602805137634277, "learning_rate": 4.234667735572323e-05, "loss": 0.1556, "step": 1950 }, { "epoch": 2.56, "grad_norm": 2.4358716011047363, "learning_rate": 4.230952978769731e-05, "loss": 0.1569, "step": 1955 }, { "epoch": 2.57, "grad_norm": 2.6005828380584717, "learning_rate": 4.227230866420932e-05, "loss": 0.158, "step": 1960 }, { "epoch": 2.58, "grad_norm": 2.054624557495117, "learning_rate": 4.223501414342745e-05, "loss": 0.1644, "step": 1965 }, { "epoch": 2.58, "grad_norm": 2.5402703285217285, "learning_rate": 4.219764638383177e-05, "loss": 0.1587, "step": 1970 }, { "epoch": 2.59, "grad_norm": 2.09084153175354, "learning_rate": 4.216020554421359e-05, "loss": 0.1561, "step": 1975 }, { "epoch": 2.6, "grad_norm": 2.529383659362793, "learning_rate": 4.2122691783674786e-05, "loss": 0.1656, "step": 1980 }, { "epoch": 2.6, "grad_norm": 2.9956157207489014, "learning_rate": 4.208510526162704e-05, "loss": 0.1649, "step": 1985 }, { "epoch": 2.61, "grad_norm": 2.585899591445923, "learning_rate": 4.20474461377913e-05, "loss": 0.1635, "step": 1990 }, { "epoch": 2.62, "grad_norm": 2.6515862941741943, "learning_rate": 4.200971457219699e-05, "loss": 0.1713, "step": 1995 }, { "epoch": 2.62, "grad_norm": 2.22086501121521, "learning_rate": 4.197191072518139e-05, "loss": 0.151, "step": 2000 }, { "epoch": 2.63, "grad_norm": 2.543677568435669, "learning_rate": 4.19340347573889e-05, "loss": 0.1756, "step": 2005 }, { "epoch": 2.63, "grad_norm": 2.5006985664367676, "learning_rate": 4.1896086829770445e-05, "loss": 0.152, "step": 2010 }, { "epoch": 2.64, "grad_norm": 2.534740924835205, "learning_rate": 4.185806710358268e-05, "loss": 0.1681, "step": 2015 }, { "epoch": 2.65, "grad_norm": 2.562382459640503, "learning_rate": 4.181997574038741e-05, "loss": 0.162, "step": 2020 }, { "epoch": 2.65, "grad_norm": 2.5193183422088623, "learning_rate": 4.178181290205082e-05, "loss": 0.1663, "step": 2025 }, { "epoch": 2.66, "grad_norm": 2.6807899475097656, "learning_rate": 4.174357875074285e-05, "loss": 0.1636, "step": 2030 }, { "epoch": 2.67, "grad_norm": 2.598508834838867, "learning_rate": 4.170527344893647e-05, "loss": 0.1704, "step": 2035 }, { "epoch": 2.67, "grad_norm": 2.301255464553833, "learning_rate": 4.1666897159406984e-05, "loss": 0.1644, "step": 2040 }, { "epoch": 2.68, "grad_norm": 2.4087393283843994, "learning_rate": 4.162845004523137e-05, "loss": 0.1739, "step": 2045 }, { "epoch": 2.69, "grad_norm": 2.3460421562194824, "learning_rate": 4.158993226978757e-05, "loss": 0.1658, "step": 2050 }, { "epoch": 2.69, "grad_norm": 2.640719175338745, "learning_rate": 4.155134399675378e-05, "loss": 0.1529, "step": 2055 }, { "epoch": 2.7, "grad_norm": 2.6366817951202393, "learning_rate": 4.151268539010777e-05, "loss": 0.176, "step": 2060 }, { "epoch": 2.71, "grad_norm": 2.4204182624816895, "learning_rate": 4.1473956614126225e-05, "loss": 0.1579, "step": 2065 }, { "epoch": 2.71, "grad_norm": 2.7791054248809814, "learning_rate": 4.1435157833383955e-05, "loss": 0.1604, "step": 2070 }, { "epoch": 2.72, "grad_norm": 2.4026386737823486, "learning_rate": 4.139628921275329e-05, "loss": 0.164, "step": 2075 }, { "epoch": 2.73, "grad_norm": 2.740560531616211, "learning_rate": 4.1357350917403314e-05, "loss": 0.1791, "step": 2080 }, { "epoch": 2.73, "grad_norm": 2.6298422813415527, "learning_rate": 4.131834311279919e-05, "loss": 0.1691, "step": 2085 }, { "epoch": 2.74, "grad_norm": 2.610245704650879, "learning_rate": 4.12792659647015e-05, "loss": 0.1694, "step": 2090 }, { "epoch": 2.75, "grad_norm": 2.5160694122314453, "learning_rate": 4.124011963916541e-05, "loss": 0.1712, "step": 2095 }, { "epoch": 2.75, "grad_norm": 2.4107940196990967, "learning_rate": 4.1200904302540136e-05, "loss": 0.1587, "step": 2100 }, { "epoch": 2.76, "grad_norm": 2.5999083518981934, "learning_rate": 4.116162012146809e-05, "loss": 0.1683, "step": 2105 }, { "epoch": 2.77, "grad_norm": 2.592486619949341, "learning_rate": 4.112226726288427e-05, "loss": 0.1673, "step": 2110 }, { "epoch": 2.77, "grad_norm": 2.6168549060821533, "learning_rate": 4.1082845894015495e-05, "loss": 0.1573, "step": 2115 }, { "epoch": 2.78, "grad_norm": 2.690314769744873, "learning_rate": 4.104335618237972e-05, "loss": 0.1763, "step": 2120 }, { "epoch": 2.79, "grad_norm": 2.612140417098999, "learning_rate": 4.1003798295785325e-05, "loss": 0.1671, "step": 2125 }, { "epoch": 2.79, "grad_norm": 2.6909706592559814, "learning_rate": 4.096417240233036e-05, "loss": 0.1653, "step": 2130 }, { "epoch": 2.8, "grad_norm": 2.353872299194336, "learning_rate": 4.092447867040191e-05, "loss": 0.1721, "step": 2135 }, { "epoch": 2.81, "grad_norm": 2.776252508163452, "learning_rate": 4.088471726867531e-05, "loss": 0.1792, "step": 2140 }, { "epoch": 2.81, "grad_norm": 2.5471363067626953, "learning_rate": 4.084488836611346e-05, "loss": 0.1728, "step": 2145 }, { "epoch": 2.82, "grad_norm": 2.5439553260803223, "learning_rate": 4.080499213196607e-05, "loss": 0.1734, "step": 2150 }, { "epoch": 2.82, "grad_norm": 2.695373773574829, "learning_rate": 4.076502873576903e-05, "loss": 0.1625, "step": 2155 }, { "epoch": 2.83, "grad_norm": 2.5151877403259277, "learning_rate": 4.072499834734357e-05, "loss": 0.1598, "step": 2160 }, { "epoch": 2.84, "grad_norm": 2.4009640216827393, "learning_rate": 4.068490113679563e-05, "loss": 0.1574, "step": 2165 }, { "epoch": 2.84, "grad_norm": 2.4583699703216553, "learning_rate": 4.06447372745151e-05, "loss": 0.1689, "step": 2170 }, { "epoch": 2.85, "grad_norm": 2.4071240425109863, "learning_rate": 4.060450693117511e-05, "loss": 0.1722, "step": 2175 }, { "epoch": 2.86, "grad_norm": 2.36995267868042, "learning_rate": 4.056421027773126e-05, "loss": 0.1709, "step": 2180 }, { "epoch": 2.86, "grad_norm": 2.5631325244903564, "learning_rate": 4.0523847485420984e-05, "loss": 0.173, "step": 2185 }, { "epoch": 2.87, "grad_norm": 2.7295174598693848, "learning_rate": 4.048341872576272e-05, "loss": 0.173, "step": 2190 }, { "epoch": 2.88, "grad_norm": 2.5564053058624268, "learning_rate": 4.044292417055525e-05, "loss": 0.1684, "step": 2195 }, { "epoch": 2.88, "grad_norm": 2.627962589263916, "learning_rate": 4.040236399187696e-05, "loss": 0.1717, "step": 2200 }, { "epoch": 2.89, "grad_norm": 2.5658953189849854, "learning_rate": 4.0361738362085064e-05, "loss": 0.1719, "step": 2205 }, { "epoch": 2.9, "grad_norm": 2.4268243312835693, "learning_rate": 4.032104745381494e-05, "loss": 0.1612, "step": 2210 }, { "epoch": 2.9, "grad_norm": 2.6990509033203125, "learning_rate": 4.028029143997935e-05, "loss": 0.1671, "step": 2215 }, { "epoch": 2.91, "grad_norm": 2.4805703163146973, "learning_rate": 4.0239470493767704e-05, "loss": 0.1735, "step": 2220 }, { "epoch": 2.92, "grad_norm": 2.5650622844696045, "learning_rate": 4.019858478864534e-05, "loss": 0.1662, "step": 2225 }, { "epoch": 2.92, "grad_norm": 2.4036471843719482, "learning_rate": 4.015763449835281e-05, "loss": 0.1571, "step": 2230 }, { "epoch": 2.93, "grad_norm": 2.5735135078430176, "learning_rate": 4.0116619796905104e-05, "loss": 0.1676, "step": 2235 }, { "epoch": 2.94, "grad_norm": 2.7664101123809814, "learning_rate": 4.0075540858590883e-05, "loss": 0.1825, "step": 2240 }, { "epoch": 2.94, "grad_norm": 2.384687900543213, "learning_rate": 4.003439785797183e-05, "loss": 0.169, "step": 2245 }, { "epoch": 2.95, "grad_norm": 2.6205379962921143, "learning_rate": 3.999319096988183e-05, "loss": 0.1745, "step": 2250 }, { "epoch": 2.96, "grad_norm": 2.52746319770813, "learning_rate": 3.995192036942625e-05, "loss": 0.166, "step": 2255 }, { "epoch": 2.96, "grad_norm": 2.5834269523620605, "learning_rate": 3.991058623198123e-05, "loss": 0.1758, "step": 2260 }, { "epoch": 2.97, "grad_norm": 2.5553784370422363, "learning_rate": 3.9869188733192846e-05, "loss": 0.1755, "step": 2265 }, { "epoch": 2.98, "grad_norm": 2.5766592025756836, "learning_rate": 3.982772804897649e-05, "loss": 0.1687, "step": 2270 }, { "epoch": 2.98, "grad_norm": 2.7371819019317627, "learning_rate": 3.978620435551599e-05, "loss": 0.1705, "step": 2275 }, { "epoch": 2.99, "grad_norm": 2.321173906326294, "learning_rate": 3.974461782926299e-05, "loss": 0.162, "step": 2280 }, { "epoch": 3.0, "grad_norm": 2.64091420173645, "learning_rate": 3.970296864693609e-05, "loss": 0.1652, "step": 2285 }, { "epoch": 3.0, "grad_norm": 1.3675230741500854, "learning_rate": 3.9661256985520156e-05, "loss": 0.1358, "step": 2290 }, { "epoch": 3.01, "grad_norm": 1.4976637363433838, "learning_rate": 3.961948302226557e-05, "loss": 0.0672, "step": 2295 }, { "epoch": 3.01, "grad_norm": 1.758554220199585, "learning_rate": 3.957764693468743e-05, "loss": 0.066, "step": 2300 }, { "epoch": 3.02, "grad_norm": 1.4874210357666016, "learning_rate": 3.953574890056485e-05, "loss": 0.0629, "step": 2305 }, { "epoch": 3.03, "grad_norm": 1.7129358053207397, "learning_rate": 3.9493789097940185e-05, "loss": 0.0642, "step": 2310 }, { "epoch": 3.03, "grad_norm": 1.8975974321365356, "learning_rate": 3.9451767705118246e-05, "loss": 0.0679, "step": 2315 }, { "epoch": 3.04, "grad_norm": 1.5627552270889282, "learning_rate": 3.940968490066559e-05, "loss": 0.0642, "step": 2320 }, { "epoch": 3.05, "grad_norm": 1.6444036960601807, "learning_rate": 3.9367540863409714e-05, "loss": 0.0691, "step": 2325 }, { "epoch": 3.05, "grad_norm": 1.6043704748153687, "learning_rate": 3.932533577243835e-05, "loss": 0.0644, "step": 2330 }, { "epoch": 3.06, "grad_norm": 1.6515140533447266, "learning_rate": 3.9283069807098636e-05, "loss": 0.0729, "step": 2335 }, { "epoch": 3.07, "grad_norm": 1.6297287940979004, "learning_rate": 3.9240743146996425e-05, "loss": 0.068, "step": 2340 }, { "epoch": 3.07, "grad_norm": 1.557881236076355, "learning_rate": 3.919835597199548e-05, "loss": 0.0688, "step": 2345 }, { "epoch": 3.08, "grad_norm": 1.708101511001587, "learning_rate": 3.915590846221669e-05, "loss": 0.0673, "step": 2350 }, { "epoch": 3.09, "grad_norm": 1.6620030403137207, "learning_rate": 3.911340079803736e-05, "loss": 0.0702, "step": 2355 }, { "epoch": 3.09, "grad_norm": 1.938750982284546, "learning_rate": 3.9070833160090415e-05, "loss": 0.0695, "step": 2360 }, { "epoch": 3.1, "grad_norm": 1.7759830951690674, "learning_rate": 3.902820572926362e-05, "loss": 0.0732, "step": 2365 }, { "epoch": 3.11, "grad_norm": 1.846912145614624, "learning_rate": 3.898551868669883e-05, "loss": 0.0668, "step": 2370 }, { "epoch": 3.11, "grad_norm": 1.79542076587677, "learning_rate": 3.8942772213791224e-05, "loss": 0.0714, "step": 2375 }, { "epoch": 3.12, "grad_norm": 1.5793654918670654, "learning_rate": 3.889996649218852e-05, "loss": 0.0682, "step": 2380 }, { "epoch": 3.13, "grad_norm": 1.8609654903411865, "learning_rate": 3.8857101703790196e-05, "loss": 0.0738, "step": 2385 }, { "epoch": 3.13, "grad_norm": 1.681381344795227, "learning_rate": 3.881417803074676e-05, "loss": 0.0747, "step": 2390 }, { "epoch": 3.14, "grad_norm": 2.0434184074401855, "learning_rate": 3.877119565545891e-05, "loss": 0.0806, "step": 2395 }, { "epoch": 3.15, "grad_norm": 1.8694220781326294, "learning_rate": 3.8728154760576817e-05, "loss": 0.0884, "step": 2400 }, { "epoch": 3.15, "grad_norm": 1.9645694494247437, "learning_rate": 3.868505552899931e-05, "loss": 0.0875, "step": 2405 }, { "epoch": 3.16, "grad_norm": 1.8987983465194702, "learning_rate": 3.8641898143873155e-05, "loss": 0.0917, "step": 2410 }, { "epoch": 3.17, "grad_norm": 2.0280187129974365, "learning_rate": 3.859868278859218e-05, "loss": 0.0878, "step": 2415 }, { "epoch": 3.17, "grad_norm": 2.094468832015991, "learning_rate": 3.855540964679658e-05, "loss": 0.0877, "step": 2420 }, { "epoch": 3.18, "grad_norm": 1.9860283136367798, "learning_rate": 3.851207890237213e-05, "loss": 0.0915, "step": 2425 }, { "epoch": 3.19, "grad_norm": 2.1709773540496826, "learning_rate": 3.846869073944934e-05, "loss": 0.095, "step": 2430 }, { "epoch": 3.19, "grad_norm": 1.8917242288589478, "learning_rate": 3.842524534240276e-05, "loss": 0.0895, "step": 2435 }, { "epoch": 3.2, "grad_norm": 1.9368691444396973, "learning_rate": 3.8381742895850106e-05, "loss": 0.0921, "step": 2440 }, { "epoch": 3.2, "grad_norm": 2.072715997695923, "learning_rate": 3.8338183584651554e-05, "loss": 0.0905, "step": 2445 }, { "epoch": 3.21, "grad_norm": 2.0285987854003906, "learning_rate": 3.8294567593908915e-05, "loss": 0.0941, "step": 2450 }, { "epoch": 3.22, "grad_norm": 2.015815019607544, "learning_rate": 3.825089510896485e-05, "loss": 0.0918, "step": 2455 }, { "epoch": 3.22, "grad_norm": 2.0444390773773193, "learning_rate": 3.820716631540209e-05, "loss": 0.0938, "step": 2460 }, { "epoch": 3.23, "grad_norm": 2.241682291030884, "learning_rate": 3.816338139904265e-05, "loss": 0.0981, "step": 2465 }, { "epoch": 3.24, "grad_norm": 2.1586482524871826, "learning_rate": 3.811954054594702e-05, "loss": 0.0916, "step": 2470 }, { "epoch": 3.24, "grad_norm": 1.968621850013733, "learning_rate": 3.807564394241341e-05, "loss": 0.0886, "step": 2475 }, { "epoch": 3.25, "grad_norm": 2.180476427078247, "learning_rate": 3.8031691774976904e-05, "loss": 0.0955, "step": 2480 }, { "epoch": 3.26, "grad_norm": 1.9769107103347778, "learning_rate": 3.7987684230408735e-05, "loss": 0.0933, "step": 2485 }, { "epoch": 3.26, "grad_norm": 1.7934843301773071, "learning_rate": 3.794362149571545e-05, "loss": 0.087, "step": 2490 }, { "epoch": 3.27, "grad_norm": 2.203385591506958, "learning_rate": 3.7899503758138114e-05, "loss": 0.0927, "step": 2495 }, { "epoch": 3.28, "grad_norm": 2.1554219722747803, "learning_rate": 3.78553312051515e-05, "loss": 0.0917, "step": 2500 }, { "epoch": 3.28, "grad_norm": 2.0353715419769287, "learning_rate": 3.781110402446337e-05, "loss": 0.0961, "step": 2505 }, { "epoch": 3.29, "grad_norm": 2.0579957962036133, "learning_rate": 3.776682240401357e-05, "loss": 0.1026, "step": 2510 }, { "epoch": 3.3, "grad_norm": 2.3313910961151123, "learning_rate": 3.772248653197331e-05, "loss": 0.0908, "step": 2515 }, { "epoch": 3.3, "grad_norm": 2.0414376258850098, "learning_rate": 3.767809659674433e-05, "loss": 0.0909, "step": 2520 }, { "epoch": 3.31, "grad_norm": 2.0286598205566406, "learning_rate": 3.7633652786958105e-05, "loss": 0.0968, "step": 2525 }, { "epoch": 3.32, "grad_norm": 2.367244005203247, "learning_rate": 3.758915529147506e-05, "loss": 0.0923, "step": 2530 }, { "epoch": 3.32, "grad_norm": 1.8567143678665161, "learning_rate": 3.754460429938373e-05, "loss": 0.092, "step": 2535 }, { "epoch": 3.33, "grad_norm": 2.1443471908569336, "learning_rate": 3.7500000000000003e-05, "loss": 0.0926, "step": 2540 }, { "epoch": 3.34, "grad_norm": 2.019869089126587, "learning_rate": 3.745534258286627e-05, "loss": 0.0851, "step": 2545 }, { "epoch": 3.34, "grad_norm": 2.1114661693573, "learning_rate": 3.741063223775066e-05, "loss": 0.0867, "step": 2550 }, { "epoch": 3.35, "grad_norm": 2.078768253326416, "learning_rate": 3.736586915464621e-05, "loss": 0.0949, "step": 2555 }, { "epoch": 3.36, "grad_norm": 1.929516315460205, "learning_rate": 3.732105352377004e-05, "loss": 0.0931, "step": 2560 }, { "epoch": 3.36, "grad_norm": 2.1101810932159424, "learning_rate": 3.727618553556262e-05, "loss": 0.0943, "step": 2565 }, { "epoch": 3.37, "grad_norm": 2.1217589378356934, "learning_rate": 3.723126538068686e-05, "loss": 0.1018, "step": 2570 }, { "epoch": 3.38, "grad_norm": 2.1733384132385254, "learning_rate": 3.718629325002736e-05, "loss": 0.0931, "step": 2575 }, { "epoch": 3.38, "grad_norm": 1.986570119857788, "learning_rate": 3.714126933468959e-05, "loss": 0.0977, "step": 2580 }, { "epoch": 3.39, "grad_norm": 2.1583731174468994, "learning_rate": 3.709619382599909e-05, "loss": 0.0959, "step": 2585 }, { "epoch": 3.4, "grad_norm": 2.0934367179870605, "learning_rate": 3.705106691550063e-05, "loss": 0.093, "step": 2590 }, { "epoch": 3.4, "grad_norm": 2.1203556060791016, "learning_rate": 3.700588879495739e-05, "loss": 0.0969, "step": 2595 }, { "epoch": 3.41, "grad_norm": 1.69106924533844, "learning_rate": 3.6960659656350186e-05, "loss": 0.0935, "step": 2600 }, { "epoch": 3.41, "grad_norm": 2.283950090408325, "learning_rate": 3.6915379691876615e-05, "loss": 0.0961, "step": 2605 }, { "epoch": 3.42, "grad_norm": 2.030233383178711, "learning_rate": 3.6870049093950284e-05, "loss": 0.0968, "step": 2610 }, { "epoch": 3.43, "grad_norm": 2.127016544342041, "learning_rate": 3.682466805519992e-05, "loss": 0.0986, "step": 2615 }, { "epoch": 3.43, "grad_norm": 2.0715529918670654, "learning_rate": 3.677923676846864e-05, "loss": 0.0908, "step": 2620 }, { "epoch": 3.44, "grad_norm": 2.094383955001831, "learning_rate": 3.673375542681305e-05, "loss": 0.0971, "step": 2625 }, { "epoch": 3.45, "grad_norm": 2.1275360584259033, "learning_rate": 3.668822422350247e-05, "loss": 0.1002, "step": 2630 }, { "epoch": 3.45, "grad_norm": 2.068857192993164, "learning_rate": 3.6642643352018116e-05, "loss": 0.0893, "step": 2635 }, { "epoch": 3.46, "grad_norm": 1.982419490814209, "learning_rate": 3.659701300605224e-05, "loss": 0.097, "step": 2640 }, { "epoch": 3.47, "grad_norm": 2.1440236568450928, "learning_rate": 3.6551333379507346e-05, "loss": 0.1063, "step": 2645 }, { "epoch": 3.47, "grad_norm": 1.987056016921997, "learning_rate": 3.650560466649538e-05, "loss": 0.0935, "step": 2650 }, { "epoch": 3.48, "grad_norm": 1.9351013898849487, "learning_rate": 3.645982706133682e-05, "loss": 0.0901, "step": 2655 }, { "epoch": 3.49, "grad_norm": 2.3136351108551025, "learning_rate": 3.641400075855995e-05, "loss": 0.0992, "step": 2660 }, { "epoch": 3.49, "grad_norm": 2.232473611831665, "learning_rate": 3.636812595289998e-05, "loss": 0.104, "step": 2665 }, { "epoch": 3.5, "grad_norm": 2.2792115211486816, "learning_rate": 3.632220283929822e-05, "loss": 0.1002, "step": 2670 }, { "epoch": 3.51, "grad_norm": 2.2950987815856934, "learning_rate": 3.627623161290127e-05, "loss": 0.1014, "step": 2675 }, { "epoch": 3.51, "grad_norm": 1.9094067811965942, "learning_rate": 3.623021246906018e-05, "loss": 0.1012, "step": 2680 }, { "epoch": 3.52, "grad_norm": 1.9479122161865234, "learning_rate": 3.618414560332962e-05, "loss": 0.0971, "step": 2685 }, { "epoch": 3.53, "grad_norm": 1.8091012239456177, "learning_rate": 3.6138031211467044e-05, "loss": 0.103, "step": 2690 }, { "epoch": 3.53, "grad_norm": 2.095745086669922, "learning_rate": 3.609186948943188e-05, "loss": 0.0953, "step": 2695 }, { "epoch": 3.54, "grad_norm": 2.011467218399048, "learning_rate": 3.604566063338467e-05, "loss": 0.1009, "step": 2700 }, { "epoch": 3.55, "grad_norm": 2.2293827533721924, "learning_rate": 3.599940483968625e-05, "loss": 0.0942, "step": 2705 }, { "epoch": 3.55, "grad_norm": 1.9385508298873901, "learning_rate": 3.595310230489692e-05, "loss": 0.0961, "step": 2710 }, { "epoch": 3.56, "grad_norm": 2.123690366744995, "learning_rate": 3.5906753225775586e-05, "loss": 0.0982, "step": 2715 }, { "epoch": 3.57, "grad_norm": 2.051839828491211, "learning_rate": 3.586035779927896e-05, "loss": 0.1023, "step": 2720 }, { "epoch": 3.57, "grad_norm": 2.269162654876709, "learning_rate": 3.581391622256069e-05, "loss": 0.0995, "step": 2725 }, { "epoch": 3.58, "grad_norm": 1.9228086471557617, "learning_rate": 3.576742869297056e-05, "loss": 0.0998, "step": 2730 }, { "epoch": 3.59, "grad_norm": 1.946722149848938, "learning_rate": 3.5720895408053574e-05, "loss": 0.0968, "step": 2735 }, { "epoch": 3.59, "grad_norm": 2.1756057739257812, "learning_rate": 3.567431656554923e-05, "loss": 0.0912, "step": 2740 }, { "epoch": 3.6, "grad_norm": 1.7844388484954834, "learning_rate": 3.562769236339058e-05, "loss": 0.0957, "step": 2745 }, { "epoch": 3.6, "grad_norm": 2.02689528465271, "learning_rate": 3.5581022999703464e-05, "loss": 0.0926, "step": 2750 }, { "epoch": 3.61, "grad_norm": 2.160264730453491, "learning_rate": 3.553430867280557e-05, "loss": 0.0974, "step": 2755 }, { "epoch": 3.62, "grad_norm": 1.962109923362732, "learning_rate": 3.548754958120573e-05, "loss": 0.0969, "step": 2760 }, { "epoch": 3.62, "grad_norm": 1.9709469079971313, "learning_rate": 3.544074592360294e-05, "loss": 0.0969, "step": 2765 }, { "epoch": 3.63, "grad_norm": 1.968109130859375, "learning_rate": 3.5393897898885606e-05, "loss": 0.1024, "step": 2770 }, { "epoch": 3.64, "grad_norm": 1.9209555387496948, "learning_rate": 3.534700570613067e-05, "loss": 0.1017, "step": 2775 }, { "epoch": 3.64, "grad_norm": 2.151937961578369, "learning_rate": 3.530006954460274e-05, "loss": 0.1007, "step": 2780 }, { "epoch": 3.65, "grad_norm": 1.9381475448608398, "learning_rate": 3.525308961375329e-05, "loss": 0.0947, "step": 2785 }, { "epoch": 3.66, "grad_norm": 1.9445606470108032, "learning_rate": 3.520606611321976e-05, "loss": 0.1005, "step": 2790 }, { "epoch": 3.66, "grad_norm": 2.063396692276001, "learning_rate": 3.515899924282478e-05, "loss": 0.1041, "step": 2795 }, { "epoch": 3.67, "grad_norm": 2.594733715057373, "learning_rate": 3.511188920257523e-05, "loss": 0.0985, "step": 2800 }, { "epoch": 3.68, "grad_norm": 2.2250747680664062, "learning_rate": 3.506473619266146e-05, "loss": 0.0956, "step": 2805 }, { "epoch": 3.68, "grad_norm": 2.2515833377838135, "learning_rate": 3.501754041345643e-05, "loss": 0.097, "step": 2810 }, { "epoch": 3.69, "grad_norm": 2.0630807876586914, "learning_rate": 3.497030206551481e-05, "loss": 0.1029, "step": 2815 }, { "epoch": 3.7, "grad_norm": 2.0855114459991455, "learning_rate": 3.492302134957218e-05, "loss": 0.1018, "step": 2820 }, { "epoch": 3.7, "grad_norm": 2.0847525596618652, "learning_rate": 3.487569846654417e-05, "loss": 0.0974, "step": 2825 }, { "epoch": 3.71, "grad_norm": 2.245652675628662, "learning_rate": 3.4828333617525586e-05, "loss": 0.0982, "step": 2830 }, { "epoch": 3.72, "grad_norm": 2.2418930530548096, "learning_rate": 3.4780927003789556e-05, "loss": 0.0984, "step": 2835 }, { "epoch": 3.72, "grad_norm": 2.1843297481536865, "learning_rate": 3.47334788267867e-05, "loss": 0.0971, "step": 2840 }, { "epoch": 3.73, "grad_norm": 2.1401710510253906, "learning_rate": 3.468598928814425e-05, "loss": 0.0983, "step": 2845 }, { "epoch": 3.74, "grad_norm": 2.237949848175049, "learning_rate": 3.4638458589665194e-05, "loss": 0.1012, "step": 2850 }, { "epoch": 3.74, "grad_norm": 2.101795196533203, "learning_rate": 3.459088693332743e-05, "loss": 0.0957, "step": 2855 }, { "epoch": 3.75, "grad_norm": 2.1468005180358887, "learning_rate": 3.454327452128292e-05, "loss": 0.1016, "step": 2860 }, { "epoch": 3.76, "grad_norm": 2.139878034591675, "learning_rate": 3.449562155585679e-05, "loss": 0.0956, "step": 2865 }, { "epoch": 3.76, "grad_norm": 2.2107348442077637, "learning_rate": 3.444792823954651e-05, "loss": 0.1002, "step": 2870 }, { "epoch": 3.77, "grad_norm": 2.1431825160980225, "learning_rate": 3.440019477502101e-05, "loss": 0.0979, "step": 2875 }, { "epoch": 3.78, "grad_norm": 2.027465581893921, "learning_rate": 3.435242136511984e-05, "loss": 0.0988, "step": 2880 }, { "epoch": 3.78, "grad_norm": 1.9341384172439575, "learning_rate": 3.430460821285225e-05, "loss": 0.0945, "step": 2885 }, { "epoch": 3.79, "grad_norm": 2.29193377494812, "learning_rate": 3.425675552139645e-05, "loss": 0.0993, "step": 2890 }, { "epoch": 3.79, "grad_norm": 2.169417381286621, "learning_rate": 3.4208863494098586e-05, "loss": 0.1008, "step": 2895 }, { "epoch": 3.8, "grad_norm": 2.211463212966919, "learning_rate": 3.416093233447201e-05, "loss": 0.0955, "step": 2900 }, { "epoch": 3.81, "grad_norm": 2.137601137161255, "learning_rate": 3.411296224619635e-05, "loss": 0.1063, "step": 2905 }, { "epoch": 3.81, "grad_norm": 2.2494797706604004, "learning_rate": 3.4064953433116675e-05, "loss": 0.1026, "step": 2910 }, { "epoch": 3.82, "grad_norm": 2.046558141708374, "learning_rate": 3.401690609924258e-05, "loss": 0.1007, "step": 2915 }, { "epoch": 3.83, "grad_norm": 2.0194621086120605, "learning_rate": 3.396882044874736e-05, "loss": 0.0924, "step": 2920 }, { "epoch": 3.83, "grad_norm": 2.130725145339966, "learning_rate": 3.392069668596716e-05, "loss": 0.0976, "step": 2925 }, { "epoch": 3.84, "grad_norm": 2.6385340690612793, "learning_rate": 3.3872535015400035e-05, "loss": 0.1062, "step": 2930 }, { "epoch": 3.85, "grad_norm": 1.9961224794387817, "learning_rate": 3.382433564170517e-05, "loss": 0.1025, "step": 2935 }, { "epoch": 3.85, "grad_norm": 2.336229085922241, "learning_rate": 3.377609876970194e-05, "loss": 0.0954, "step": 2940 }, { "epoch": 3.86, "grad_norm": 2.0952861309051514, "learning_rate": 3.372782460436908e-05, "loss": 0.0983, "step": 2945 }, { "epoch": 3.87, "grad_norm": 2.1143643856048584, "learning_rate": 3.367951335084379e-05, "loss": 0.1025, "step": 2950 }, { "epoch": 3.87, "grad_norm": 2.1523244380950928, "learning_rate": 3.363116521442087e-05, "loss": 0.1022, "step": 2955 }, { "epoch": 3.88, "grad_norm": 2.2812914848327637, "learning_rate": 3.3582780400551864e-05, "loss": 0.1058, "step": 2960 }, { "epoch": 3.89, "grad_norm": 2.106767177581787, "learning_rate": 3.353435911484417e-05, "loss": 0.0975, "step": 2965 }, { "epoch": 3.89, "grad_norm": 1.8392610549926758, "learning_rate": 3.348590156306017e-05, "loss": 0.1002, "step": 2970 }, { "epoch": 3.9, "grad_norm": 2.386420726776123, "learning_rate": 3.343740795111634e-05, "loss": 0.1028, "step": 2975 }, { "epoch": 3.91, "grad_norm": 2.3647854328155518, "learning_rate": 3.338887848508242e-05, "loss": 0.098, "step": 2980 }, { "epoch": 3.91, "grad_norm": 2.3350307941436768, "learning_rate": 3.334031337118048e-05, "loss": 0.1101, "step": 2985 }, { "epoch": 3.92, "grad_norm": 2.1085422039031982, "learning_rate": 3.3291712815784104e-05, "loss": 0.1061, "step": 2990 }, { "epoch": 3.93, "grad_norm": 2.0244526863098145, "learning_rate": 3.3243077025417443e-05, "loss": 0.1001, "step": 2995 }, { "epoch": 3.93, "grad_norm": 2.0625123977661133, "learning_rate": 3.319440620675442e-05, "loss": 0.0924, "step": 3000 } ], "logging_steps": 5, "max_steps": 7620, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 100, "total_flos": 1.0787435279725363e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }