|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.0, |
|
"eval_steps": 5000, |
|
"global_step": 52737, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.005688605722737357, |
|
"grad_norm": 2.727705717086792, |
|
"learning_rate": 0.0007984982080891974, |
|
"loss": 2.3682, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.011377211445474714, |
|
"grad_norm": 2.5978145599365234, |
|
"learning_rate": 0.0007969812465631342, |
|
"loss": 2.2098, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.01706581716821207, |
|
"grad_norm": 2.488832473754883, |
|
"learning_rate": 0.0007954642850370708, |
|
"loss": 2.1041, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.02275442289094943, |
|
"grad_norm": 2.1368465423583984, |
|
"learning_rate": 0.0007939473235110074, |
|
"loss": 2.0233, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.028443028613686784, |
|
"grad_norm": 2.7280213832855225, |
|
"learning_rate": 0.0007924303619849442, |
|
"loss": 1.9639, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.03413163433642414, |
|
"grad_norm": 2.5257463455200195, |
|
"learning_rate": 0.0007909134004588809, |
|
"loss": 2.0229, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.0398202400591615, |
|
"grad_norm": 2.748051404953003, |
|
"learning_rate": 0.0007893964389328175, |
|
"loss": 1.9867, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.04550884578189886, |
|
"grad_norm": 2.212047815322876, |
|
"learning_rate": 0.0007878794774067543, |
|
"loss": 1.9354, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.05119745150463621, |
|
"grad_norm": 2.423400640487671, |
|
"learning_rate": 0.000786362515880691, |
|
"loss": 1.9127, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.05688605722737357, |
|
"grad_norm": 2.379678726196289, |
|
"learning_rate": 0.0007848455543546277, |
|
"loss": 1.8927, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.06257466295011092, |
|
"grad_norm": 2.5806541442871094, |
|
"learning_rate": 0.0007833285928285645, |
|
"loss": 1.8471, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.06826326867284828, |
|
"grad_norm": 2.4539499282836914, |
|
"learning_rate": 0.0007818116313025011, |
|
"loss": 1.8296, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.07395187439558565, |
|
"grad_norm": 2.7818546295166016, |
|
"learning_rate": 0.0007802946697764378, |
|
"loss": 1.8136, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.079640480118323, |
|
"grad_norm": 2.979959487915039, |
|
"learning_rate": 0.0007787777082503746, |
|
"loss": 1.7844, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.08532908584106036, |
|
"grad_norm": 2.3885879516601562, |
|
"learning_rate": 0.0007772607467243113, |
|
"loss": 1.7627, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.09101769156379771, |
|
"grad_norm": 2.2447025775909424, |
|
"learning_rate": 0.0007757437851982479, |
|
"loss": 1.7375, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.09670629728653507, |
|
"grad_norm": 2.184580087661743, |
|
"learning_rate": 0.0007742268236721846, |
|
"loss": 1.7212, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.10239490300927243, |
|
"grad_norm": 2.2506866455078125, |
|
"learning_rate": 0.0007727098621461213, |
|
"loss": 1.7264, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.10808350873200978, |
|
"grad_norm": 2.059812068939209, |
|
"learning_rate": 0.000771192900620058, |
|
"loss": 1.7123, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.11377211445474714, |
|
"grad_norm": 2.3013007640838623, |
|
"learning_rate": 0.0007696759390939948, |
|
"loss": 1.7122, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.11946072017748449, |
|
"grad_norm": 2.7073047161102295, |
|
"learning_rate": 0.0007681741471831921, |
|
"loss": 1.6847, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.12514932590022185, |
|
"grad_norm": 2.023949384689331, |
|
"learning_rate": 0.0007666571856571288, |
|
"loss": 1.6941, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.1308379316229592, |
|
"grad_norm": 1.9444501399993896, |
|
"learning_rate": 0.0007651402241310656, |
|
"loss": 1.6682, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.13652653734569656, |
|
"grad_norm": 2.691826105117798, |
|
"learning_rate": 0.0007636232626050023, |
|
"loss": 1.6409, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.1422151430684339, |
|
"grad_norm": 2.483386993408203, |
|
"learning_rate": 0.0007621063010789389, |
|
"loss": 1.662, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.1479037487911713, |
|
"grad_norm": 2.1850545406341553, |
|
"learning_rate": 0.0007605893395528756, |
|
"loss": 1.6532, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.15359235451390865, |
|
"grad_norm": 1.989560842514038, |
|
"learning_rate": 0.0007590723780268123, |
|
"loss": 1.6231, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.159280960236646, |
|
"grad_norm": 2.1362531185150146, |
|
"learning_rate": 0.000757555416500749, |
|
"loss": 1.6019, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.16496956595938336, |
|
"grad_norm": 2.3262641429901123, |
|
"learning_rate": 0.0007560384549746857, |
|
"loss": 1.6103, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.17065817168212072, |
|
"grad_norm": 2.297419309616089, |
|
"learning_rate": 0.0007545214934486224, |
|
"loss": 1.6314, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.17634677740485807, |
|
"grad_norm": 2.1368629932403564, |
|
"learning_rate": 0.0007530045319225591, |
|
"loss": 1.5838, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.18203538312759543, |
|
"grad_norm": 2.3383195400238037, |
|
"learning_rate": 0.0007514875703964959, |
|
"loss": 1.5857, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.18772398885033278, |
|
"grad_norm": 2.149740219116211, |
|
"learning_rate": 0.0007499706088704326, |
|
"loss": 1.6016, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.19341259457307014, |
|
"grad_norm": 2.096703290939331, |
|
"learning_rate": 0.0007484536473443692, |
|
"loss": 1.5904, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.1991012002958075, |
|
"grad_norm": 2.2043957710266113, |
|
"learning_rate": 0.000746936685818306, |
|
"loss": 1.5787, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.20478980601854485, |
|
"grad_norm": 2.6369898319244385, |
|
"learning_rate": 0.0007454197242922427, |
|
"loss": 1.5539, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.2104784117412822, |
|
"grad_norm": 1.9776628017425537, |
|
"learning_rate": 0.0007439027627661794, |
|
"loss": 1.5815, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.21616701746401956, |
|
"grad_norm": 2.2001795768737793, |
|
"learning_rate": 0.0007423858012401161, |
|
"loss": 1.5583, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.22185562318675692, |
|
"grad_norm": 2.2252562046051025, |
|
"learning_rate": 0.0007408688397140527, |
|
"loss": 1.5539, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.22754422890949427, |
|
"grad_norm": 2.15871262550354, |
|
"learning_rate": 0.0007393518781879895, |
|
"loss": 1.5786, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.23323283463223163, |
|
"grad_norm": 2.026066303253174, |
|
"learning_rate": 0.0007378500862771869, |
|
"loss": 1.5452, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.23892144035496898, |
|
"grad_norm": 2.116511583328247, |
|
"learning_rate": 0.0007363331247511235, |
|
"loss": 1.5381, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.24461004607770637, |
|
"grad_norm": 1.9454152584075928, |
|
"learning_rate": 0.0007348161632250602, |
|
"loss": 1.557, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.2502986518004437, |
|
"grad_norm": 1.8668495416641235, |
|
"learning_rate": 0.000733299201698997, |
|
"loss": 1.5406, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.2559872575231811, |
|
"grad_norm": 2.0886125564575195, |
|
"learning_rate": 0.0007317822401729337, |
|
"loss": 1.519, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.2616758632459184, |
|
"grad_norm": 2.5768446922302246, |
|
"learning_rate": 0.0007302652786468704, |
|
"loss": 1.5178, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.2673644689686558, |
|
"grad_norm": 2.4169631004333496, |
|
"learning_rate": 0.000728748317120807, |
|
"loss": 1.5283, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.2730530746913931, |
|
"grad_norm": 2.7676291465759277, |
|
"learning_rate": 0.0007272313555947437, |
|
"loss": 1.5265, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.2787416804141305, |
|
"grad_norm": 1.9152452945709229, |
|
"learning_rate": 0.0007257143940686805, |
|
"loss": 1.5152, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.2844302861368678, |
|
"grad_norm": 2.56608510017395, |
|
"learning_rate": 0.0007241974325426172, |
|
"loss": 1.5084, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.2844302861368678, |
|
"eval_accuracy": 0.636444, |
|
"eval_loss": 1.469992995262146, |
|
"eval_runtime": 85.5907, |
|
"eval_samples_per_second": 2920.879, |
|
"eval_steps_per_second": 11.415, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.2901188918596052, |
|
"grad_norm": 2.2403135299682617, |
|
"learning_rate": 0.0007226804710165538, |
|
"loss": 1.5076, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 0.2958074975823426, |
|
"grad_norm": 2.058535099029541, |
|
"learning_rate": 0.0007211635094904906, |
|
"loss": 1.4973, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.3014961033050799, |
|
"grad_norm": 1.9374159574508667, |
|
"learning_rate": 0.0007196465479644273, |
|
"loss": 1.5055, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 0.3071847090278173, |
|
"grad_norm": 1.8894695043563843, |
|
"learning_rate": 0.000718129586438364, |
|
"loss": 1.4996, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 0.31287331475055463, |
|
"grad_norm": 2.5466501712799072, |
|
"learning_rate": 0.0007166126249123008, |
|
"loss": 1.5006, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.318561920473292, |
|
"grad_norm": 1.9721605777740479, |
|
"learning_rate": 0.0007150956633862374, |
|
"loss": 1.4932, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 0.32425052619602934, |
|
"grad_norm": 1.8921763896942139, |
|
"learning_rate": 0.0007135787018601741, |
|
"loss": 1.4762, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 0.3299391319187667, |
|
"grad_norm": 2.49052357673645, |
|
"learning_rate": 0.0007120617403341108, |
|
"loss": 1.4636, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 0.33562773764150405, |
|
"grad_norm": 1.8825891017913818, |
|
"learning_rate": 0.0007105447788080475, |
|
"loss": 1.4903, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 0.34131634336424144, |
|
"grad_norm": 1.9227776527404785, |
|
"learning_rate": 0.0007090278172819842, |
|
"loss": 1.4602, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.34700494908697876, |
|
"grad_norm": 2.173774242401123, |
|
"learning_rate": 0.0007075108557559209, |
|
"loss": 1.4581, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 0.35269355480971615, |
|
"grad_norm": 1.9840656518936157, |
|
"learning_rate": 0.0007059938942298576, |
|
"loss": 1.4504, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 0.3583821605324535, |
|
"grad_norm": 2.368171453475952, |
|
"learning_rate": 0.000704492102319055, |
|
"loss": 1.4659, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 0.36407076625519086, |
|
"grad_norm": 2.005125045776367, |
|
"learning_rate": 0.0007029751407929917, |
|
"loss": 1.4698, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 0.3697593719779282, |
|
"grad_norm": 1.8724095821380615, |
|
"learning_rate": 0.0007014581792669284, |
|
"loss": 1.4429, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.37544797770066557, |
|
"grad_norm": 1.8412431478500366, |
|
"learning_rate": 0.0006999412177408651, |
|
"loss": 1.4368, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 0.3811365834234029, |
|
"grad_norm": 1.9016755819320679, |
|
"learning_rate": 0.0006984242562148018, |
|
"loss": 1.4351, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 0.3868251891461403, |
|
"grad_norm": 1.9896953105926514, |
|
"learning_rate": 0.0006969072946887384, |
|
"loss": 1.4563, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 0.39251379486887766, |
|
"grad_norm": 2.3341548442840576, |
|
"learning_rate": 0.0006953903331626751, |
|
"loss": 1.4457, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 0.398202400591615, |
|
"grad_norm": 1.95259690284729, |
|
"learning_rate": 0.0006938733716366119, |
|
"loss": 1.4636, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.4038910063143524, |
|
"grad_norm": 1.8444461822509766, |
|
"learning_rate": 0.0006923564101105486, |
|
"loss": 1.4418, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 0.4095796120370897, |
|
"grad_norm": 1.9170624017715454, |
|
"learning_rate": 0.0006908394485844853, |
|
"loss": 1.4267, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 0.4152682177598271, |
|
"grad_norm": 1.6293827295303345, |
|
"learning_rate": 0.000689322487058422, |
|
"loss": 1.4474, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 0.4209568234825644, |
|
"grad_norm": 2.2202467918395996, |
|
"learning_rate": 0.0006878055255323587, |
|
"loss": 1.4166, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 0.4266454292053018, |
|
"grad_norm": 1.9069397449493408, |
|
"learning_rate": 0.0006862885640062954, |
|
"loss": 1.4194, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.4323340349280391, |
|
"grad_norm": 2.0205297470092773, |
|
"learning_rate": 0.0006847716024802322, |
|
"loss": 1.4328, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 0.4380226406507765, |
|
"grad_norm": 1.6736252307891846, |
|
"learning_rate": 0.0006832546409541689, |
|
"loss": 1.4309, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 0.44371124637351383, |
|
"grad_norm": 1.7010937929153442, |
|
"learning_rate": 0.0006817376794281055, |
|
"loss": 1.412, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 0.4493998520962512, |
|
"grad_norm": 2.748424768447876, |
|
"learning_rate": 0.0006802207179020422, |
|
"loss": 1.421, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 0.45508845781898855, |
|
"grad_norm": 1.908728837966919, |
|
"learning_rate": 0.0006787037563759789, |
|
"loss": 1.4172, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.46077706354172593, |
|
"grad_norm": 1.8672014474868774, |
|
"learning_rate": 0.0006771867948499157, |
|
"loss": 1.4448, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 0.46646566926446326, |
|
"grad_norm": 2.128519058227539, |
|
"learning_rate": 0.0006756698333238524, |
|
"loss": 1.4158, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 0.47215427498720064, |
|
"grad_norm": 1.7498713731765747, |
|
"learning_rate": 0.000674152871797789, |
|
"loss": 1.412, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 0.47784288070993797, |
|
"grad_norm": 1.7801289558410645, |
|
"learning_rate": 0.0006726510798869864, |
|
"loss": 1.4146, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 0.48353148643267535, |
|
"grad_norm": 1.9360538721084595, |
|
"learning_rate": 0.0006711341183609232, |
|
"loss": 1.4252, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.48922009215541273, |
|
"grad_norm": 2.3669304847717285, |
|
"learning_rate": 0.0006696171568348598, |
|
"loss": 1.4057, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 0.49490869787815006, |
|
"grad_norm": 1.7751379013061523, |
|
"learning_rate": 0.0006681001953087965, |
|
"loss": 1.4049, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 0.5005973036008874, |
|
"grad_norm": 2.5389885902404785, |
|
"learning_rate": 0.0006665832337827332, |
|
"loss": 1.3837, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 0.5062859093236248, |
|
"grad_norm": 2.5082690715789795, |
|
"learning_rate": 0.0006650662722566699, |
|
"loss": 1.3924, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 0.5119745150463622, |
|
"grad_norm": 2.011589527130127, |
|
"learning_rate": 0.0006635493107306066, |
|
"loss": 1.3956, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.5176631207690995, |
|
"grad_norm": 1.819793939590454, |
|
"learning_rate": 0.0006620323492045433, |
|
"loss": 1.4063, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 0.5233517264918368, |
|
"grad_norm": 2.081247568130493, |
|
"learning_rate": 0.00066051538767848, |
|
"loss": 1.4145, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 0.5290403322145742, |
|
"grad_norm": 2.151563882827759, |
|
"learning_rate": 0.0006589984261524168, |
|
"loss": 1.4002, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 0.5347289379373116, |
|
"grad_norm": 1.9170759916305542, |
|
"learning_rate": 0.0006574814646263535, |
|
"loss": 1.3863, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 0.5404175436600489, |
|
"grad_norm": 1.6435186862945557, |
|
"learning_rate": 0.0006559645031002901, |
|
"loss": 1.3888, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 0.5461061493827862, |
|
"grad_norm": 1.8130972385406494, |
|
"learning_rate": 0.0006544475415742268, |
|
"loss": 1.3904, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 0.5517947551055237, |
|
"grad_norm": 1.8200345039367676, |
|
"learning_rate": 0.0006529305800481636, |
|
"loss": 1.3647, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 0.557483360828261, |
|
"grad_norm": 1.7286423444747925, |
|
"learning_rate": 0.0006514136185221003, |
|
"loss": 1.3815, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 0.5631719665509983, |
|
"grad_norm": 2.345879554748535, |
|
"learning_rate": 0.000649896656996037, |
|
"loss": 1.3919, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 0.5688605722737357, |
|
"grad_norm": 1.8189209699630737, |
|
"learning_rate": 0.0006483796954699736, |
|
"loss": 1.3684, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.5688605722737357, |
|
"eval_accuracy": 0.667392, |
|
"eval_loss": 1.3353288173675537, |
|
"eval_runtime": 85.0475, |
|
"eval_samples_per_second": 2939.533, |
|
"eval_steps_per_second": 11.488, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.5745491779964731, |
|
"grad_norm": 1.7264429330825806, |
|
"learning_rate": 0.0006468627339439103, |
|
"loss": 1.4044, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 0.5802377837192104, |
|
"grad_norm": 1.8806540966033936, |
|
"learning_rate": 0.0006453457724178471, |
|
"loss": 1.3746, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 0.5859263894419477, |
|
"grad_norm": 1.7714815139770508, |
|
"learning_rate": 0.0006438288108917838, |
|
"loss": 1.3837, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 0.5916149951646852, |
|
"grad_norm": 1.713157057762146, |
|
"learning_rate": 0.0006423118493657205, |
|
"loss": 1.3939, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 0.5973036008874225, |
|
"grad_norm": 2.169168472290039, |
|
"learning_rate": 0.0006408100574549179, |
|
"loss": 1.3658, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 0.6029922066101598, |
|
"grad_norm": 1.727501630783081, |
|
"learning_rate": 0.0006392930959288546, |
|
"loss": 1.3907, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 0.6086808123328972, |
|
"grad_norm": 2.0120322704315186, |
|
"learning_rate": 0.0006377761344027913, |
|
"loss": 1.3757, |
|
"step": 10700 |
|
}, |
|
{ |
|
"epoch": 0.6143694180556346, |
|
"grad_norm": 1.799139142036438, |
|
"learning_rate": 0.0006362591728767279, |
|
"loss": 1.3803, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 0.6200580237783719, |
|
"grad_norm": 1.8817808628082275, |
|
"learning_rate": 0.0006347422113506646, |
|
"loss": 1.3702, |
|
"step": 10900 |
|
}, |
|
{ |
|
"epoch": 0.6257466295011093, |
|
"grad_norm": 2.1144518852233887, |
|
"learning_rate": 0.0006332252498246013, |
|
"loss": 1.3832, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.6314352352238466, |
|
"grad_norm": 2.1396071910858154, |
|
"learning_rate": 0.0006317082882985381, |
|
"loss": 1.3611, |
|
"step": 11100 |
|
}, |
|
{ |
|
"epoch": 0.637123840946584, |
|
"grad_norm": 1.6794757843017578, |
|
"learning_rate": 0.0006301913267724747, |
|
"loss": 1.368, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 0.6428124466693214, |
|
"grad_norm": 2.268433094024658, |
|
"learning_rate": 0.0006286743652464114, |
|
"loss": 1.3498, |
|
"step": 11300 |
|
}, |
|
{ |
|
"epoch": 0.6485010523920587, |
|
"grad_norm": 1.8515706062316895, |
|
"learning_rate": 0.0006271574037203482, |
|
"loss": 1.3489, |
|
"step": 11400 |
|
}, |
|
{ |
|
"epoch": 0.654189658114796, |
|
"grad_norm": 2.482171058654785, |
|
"learning_rate": 0.0006256404421942849, |
|
"loss": 1.3501, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 0.6598782638375335, |
|
"grad_norm": 1.9485667943954468, |
|
"learning_rate": 0.0006241234806682216, |
|
"loss": 1.3483, |
|
"step": 11600 |
|
}, |
|
{ |
|
"epoch": 0.6655668695602708, |
|
"grad_norm": 1.8601367473602295, |
|
"learning_rate": 0.0006226065191421583, |
|
"loss": 1.3392, |
|
"step": 11700 |
|
}, |
|
{ |
|
"epoch": 0.6712554752830081, |
|
"grad_norm": 1.870851993560791, |
|
"learning_rate": 0.000621089557616095, |
|
"loss": 1.352, |
|
"step": 11800 |
|
}, |
|
{ |
|
"epoch": 0.6769440810057455, |
|
"grad_norm": 1.9454014301300049, |
|
"learning_rate": 0.0006195725960900317, |
|
"loss": 1.3537, |
|
"step": 11900 |
|
}, |
|
{ |
|
"epoch": 0.6826326867284829, |
|
"grad_norm": 1.9180669784545898, |
|
"learning_rate": 0.0006180556345639685, |
|
"loss": 1.3541, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.6883212924512202, |
|
"grad_norm": 1.7796809673309326, |
|
"learning_rate": 0.0006165386730379051, |
|
"loss": 1.331, |
|
"step": 12100 |
|
}, |
|
{ |
|
"epoch": 0.6940098981739575, |
|
"grad_norm": 2.040998935699463, |
|
"learning_rate": 0.0006150217115118417, |
|
"loss": 1.3214, |
|
"step": 12200 |
|
}, |
|
{ |
|
"epoch": 0.699698503896695, |
|
"grad_norm": 1.7188791036605835, |
|
"learning_rate": 0.0006135047499857785, |
|
"loss": 1.3577, |
|
"step": 12300 |
|
}, |
|
{ |
|
"epoch": 0.7053871096194323, |
|
"grad_norm": 1.9152625799179077, |
|
"learning_rate": 0.0006119877884597152, |
|
"loss": 1.3682, |
|
"step": 12400 |
|
}, |
|
{ |
|
"epoch": 0.7110757153421696, |
|
"grad_norm": 2.150810718536377, |
|
"learning_rate": 0.0006104708269336519, |
|
"loss": 1.3388, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 0.716764321064907, |
|
"grad_norm": 1.97470223903656, |
|
"learning_rate": 0.0006089538654075887, |
|
"loss": 1.3319, |
|
"step": 12600 |
|
}, |
|
{ |
|
"epoch": 0.7224529267876444, |
|
"grad_norm": 1.663122296333313, |
|
"learning_rate": 0.0006074369038815253, |
|
"loss": 1.3593, |
|
"step": 12700 |
|
}, |
|
{ |
|
"epoch": 0.7281415325103817, |
|
"grad_norm": 1.6453677415847778, |
|
"learning_rate": 0.0006059351119707227, |
|
"loss": 1.3592, |
|
"step": 12800 |
|
}, |
|
{ |
|
"epoch": 0.733830138233119, |
|
"grad_norm": 1.6896419525146484, |
|
"learning_rate": 0.0006044181504446595, |
|
"loss": 1.3183, |
|
"step": 12900 |
|
}, |
|
{ |
|
"epoch": 0.7395187439558564, |
|
"grad_norm": 1.7903008460998535, |
|
"learning_rate": 0.000602901188918596, |
|
"loss": 1.3373, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.7452073496785938, |
|
"grad_norm": 2.2026655673980713, |
|
"learning_rate": 0.0006013842273925327, |
|
"loss": 1.3403, |
|
"step": 13100 |
|
}, |
|
{ |
|
"epoch": 0.7508959554013311, |
|
"grad_norm": 1.9204201698303223, |
|
"learning_rate": 0.0005998672658664695, |
|
"loss": 1.3199, |
|
"step": 13200 |
|
}, |
|
{ |
|
"epoch": 0.7565845611240685, |
|
"grad_norm": 1.946899652481079, |
|
"learning_rate": 0.0005983503043404062, |
|
"loss": 1.3298, |
|
"step": 13300 |
|
}, |
|
{ |
|
"epoch": 0.7622731668468058, |
|
"grad_norm": 2.019131898880005, |
|
"learning_rate": 0.0005968333428143428, |
|
"loss": 1.3449, |
|
"step": 13400 |
|
}, |
|
{ |
|
"epoch": 0.7679617725695432, |
|
"grad_norm": 1.848008155822754, |
|
"learning_rate": 0.0005953163812882796, |
|
"loss": 1.3206, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 0.7736503782922806, |
|
"grad_norm": 2.373288631439209, |
|
"learning_rate": 0.0005937994197622163, |
|
"loss": 1.3564, |
|
"step": 13600 |
|
}, |
|
{ |
|
"epoch": 0.7793389840150179, |
|
"grad_norm": 2.556985855102539, |
|
"learning_rate": 0.000592282458236153, |
|
"loss": 1.3254, |
|
"step": 13700 |
|
}, |
|
{ |
|
"epoch": 0.7850275897377553, |
|
"grad_norm": 1.8957433700561523, |
|
"learning_rate": 0.0005907654967100898, |
|
"loss": 1.3498, |
|
"step": 13800 |
|
}, |
|
{ |
|
"epoch": 0.7907161954604927, |
|
"grad_norm": 1.7315127849578857, |
|
"learning_rate": 0.0005892485351840264, |
|
"loss": 1.3249, |
|
"step": 13900 |
|
}, |
|
{ |
|
"epoch": 0.79640480118323, |
|
"grad_norm": 1.973764419555664, |
|
"learning_rate": 0.0005877315736579631, |
|
"loss": 1.3305, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 0.8020934069059673, |
|
"grad_norm": 1.711145281791687, |
|
"learning_rate": 0.0005862146121318999, |
|
"loss": 1.3011, |
|
"step": 14100 |
|
}, |
|
{ |
|
"epoch": 0.8077820126287047, |
|
"grad_norm": 1.8515042066574097, |
|
"learning_rate": 0.0005846976506058365, |
|
"loss": 1.3195, |
|
"step": 14200 |
|
}, |
|
{ |
|
"epoch": 0.8134706183514421, |
|
"grad_norm": 1.6278700828552246, |
|
"learning_rate": 0.0005831806890797733, |
|
"loss": 1.3308, |
|
"step": 14300 |
|
}, |
|
{ |
|
"epoch": 0.8191592240741794, |
|
"grad_norm": 1.444455623626709, |
|
"learning_rate": 0.0005816637275537099, |
|
"loss": 1.3119, |
|
"step": 14400 |
|
}, |
|
{ |
|
"epoch": 0.8248478297969167, |
|
"grad_norm": 1.6277796030044556, |
|
"learning_rate": 0.0005801467660276466, |
|
"loss": 1.3227, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 0.8305364355196542, |
|
"grad_norm": 1.8428665399551392, |
|
"learning_rate": 0.0005786298045015834, |
|
"loss": 1.3336, |
|
"step": 14600 |
|
}, |
|
{ |
|
"epoch": 0.8362250412423915, |
|
"grad_norm": 1.6377763748168945, |
|
"learning_rate": 0.0005771128429755201, |
|
"loss": 1.3141, |
|
"step": 14700 |
|
}, |
|
{ |
|
"epoch": 0.8419136469651288, |
|
"grad_norm": 1.7305645942687988, |
|
"learning_rate": 0.0005755958814494568, |
|
"loss": 1.3062, |
|
"step": 14800 |
|
}, |
|
{ |
|
"epoch": 0.8476022526878662, |
|
"grad_norm": 2.469701051712036, |
|
"learning_rate": 0.0005740940895386541, |
|
"loss": 1.3074, |
|
"step": 14900 |
|
}, |
|
{ |
|
"epoch": 0.8532908584106036, |
|
"grad_norm": 1.952755331993103, |
|
"learning_rate": 0.0005725771280125909, |
|
"loss": 1.3568, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.8532908584106036, |
|
"eval_accuracy": 0.68038, |
|
"eval_loss": 1.2764052152633667, |
|
"eval_runtime": 82.5452, |
|
"eval_samples_per_second": 3028.643, |
|
"eval_steps_per_second": 11.836, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.8589794641333409, |
|
"grad_norm": 3.221471071243286, |
|
"learning_rate": 0.0005710601664865274, |
|
"loss": 1.3341, |
|
"step": 15100 |
|
}, |
|
{ |
|
"epoch": 0.8646680698560782, |
|
"grad_norm": 2.2455317974090576, |
|
"learning_rate": 0.0005695432049604642, |
|
"loss": 1.3276, |
|
"step": 15200 |
|
}, |
|
{ |
|
"epoch": 0.8703566755788157, |
|
"grad_norm": 1.8076684474945068, |
|
"learning_rate": 0.0005680262434344009, |
|
"loss": 1.2922, |
|
"step": 15300 |
|
}, |
|
{ |
|
"epoch": 0.876045281301553, |
|
"grad_norm": 1.701774001121521, |
|
"learning_rate": 0.0005665092819083376, |
|
"loss": 1.3003, |
|
"step": 15400 |
|
}, |
|
{ |
|
"epoch": 0.8817338870242903, |
|
"grad_norm": 1.5403673648834229, |
|
"learning_rate": 0.0005649923203822744, |
|
"loss": 1.3207, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 0.8874224927470277, |
|
"grad_norm": 1.9462639093399048, |
|
"learning_rate": 0.000563475358856211, |
|
"loss": 1.3098, |
|
"step": 15600 |
|
}, |
|
{ |
|
"epoch": 0.8931110984697651, |
|
"grad_norm": 1.6688456535339355, |
|
"learning_rate": 0.0005619583973301477, |
|
"loss": 1.2993, |
|
"step": 15700 |
|
}, |
|
{ |
|
"epoch": 0.8987997041925024, |
|
"grad_norm": 1.6060837507247925, |
|
"learning_rate": 0.0005604414358040845, |
|
"loss": 1.3145, |
|
"step": 15800 |
|
}, |
|
{ |
|
"epoch": 0.9044883099152398, |
|
"grad_norm": 1.8593111038208008, |
|
"learning_rate": 0.0005589244742780212, |
|
"loss": 1.2836, |
|
"step": 15900 |
|
}, |
|
{ |
|
"epoch": 0.9101769156379771, |
|
"grad_norm": 2.035261869430542, |
|
"learning_rate": 0.0005574075127519579, |
|
"loss": 1.3125, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 0.9158655213607145, |
|
"grad_norm": 1.6091046333312988, |
|
"learning_rate": 0.0005558905512258946, |
|
"loss": 1.2868, |
|
"step": 16100 |
|
}, |
|
{ |
|
"epoch": 0.9215541270834519, |
|
"grad_norm": 1.656204104423523, |
|
"learning_rate": 0.0005543735896998313, |
|
"loss": 1.3075, |
|
"step": 16200 |
|
}, |
|
{ |
|
"epoch": 0.9272427328061892, |
|
"grad_norm": 1.5555946826934814, |
|
"learning_rate": 0.0005528566281737679, |
|
"loss": 1.2963, |
|
"step": 16300 |
|
}, |
|
{ |
|
"epoch": 0.9329313385289265, |
|
"grad_norm": 1.7379626035690308, |
|
"learning_rate": 0.0005513396666477047, |
|
"loss": 1.2905, |
|
"step": 16400 |
|
}, |
|
{ |
|
"epoch": 0.938619944251664, |
|
"grad_norm": 1.5103166103363037, |
|
"learning_rate": 0.0005498227051216414, |
|
"loss": 1.2848, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 0.9443085499744013, |
|
"grad_norm": 1.5895978212356567, |
|
"learning_rate": 0.000548305743595578, |
|
"loss": 1.293, |
|
"step": 16600 |
|
}, |
|
{ |
|
"epoch": 0.9499971556971386, |
|
"grad_norm": 1.6526978015899658, |
|
"learning_rate": 0.0005467887820695148, |
|
"loss": 1.288, |
|
"step": 16700 |
|
}, |
|
{ |
|
"epoch": 0.9556857614198759, |
|
"grad_norm": 1.7471717596054077, |
|
"learning_rate": 0.0005452718205434515, |
|
"loss": 1.3099, |
|
"step": 16800 |
|
}, |
|
{ |
|
"epoch": 0.9613743671426134, |
|
"grad_norm": 1.5995450019836426, |
|
"learning_rate": 0.0005437700286326488, |
|
"loss": 1.2886, |
|
"step": 16900 |
|
}, |
|
{ |
|
"epoch": 0.9670629728653507, |
|
"grad_norm": 1.7462047338485718, |
|
"learning_rate": 0.0005422530671065856, |
|
"loss": 1.317, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 0.972751578588088, |
|
"grad_norm": 1.5739308595657349, |
|
"learning_rate": 0.0005407361055805223, |
|
"loss": 1.2997, |
|
"step": 17100 |
|
}, |
|
{ |
|
"epoch": 0.9784401843108255, |
|
"grad_norm": 1.6608139276504517, |
|
"learning_rate": 0.0005392191440544589, |
|
"loss": 1.3037, |
|
"step": 17200 |
|
}, |
|
{ |
|
"epoch": 0.9841287900335628, |
|
"grad_norm": 1.7515637874603271, |
|
"learning_rate": 0.0005377021825283956, |
|
"loss": 1.302, |
|
"step": 17300 |
|
}, |
|
{ |
|
"epoch": 0.9898173957563001, |
|
"grad_norm": 1.572986364364624, |
|
"learning_rate": 0.0005361852210023323, |
|
"loss": 1.2945, |
|
"step": 17400 |
|
}, |
|
{ |
|
"epoch": 0.9955060014790375, |
|
"grad_norm": 1.9207016229629517, |
|
"learning_rate": 0.000534668259476269, |
|
"loss": 1.2747, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 1.0011946072017748, |
|
"grad_norm": 1.9010945558547974, |
|
"learning_rate": 0.0005331512979502058, |
|
"loss": 1.263, |
|
"step": 17600 |
|
}, |
|
{ |
|
"epoch": 1.0068832129245122, |
|
"grad_norm": 2.4259393215179443, |
|
"learning_rate": 0.0005316343364241425, |
|
"loss": 1.2741, |
|
"step": 17700 |
|
}, |
|
{ |
|
"epoch": 1.0125718186472497, |
|
"grad_norm": 2.5002028942108154, |
|
"learning_rate": 0.0005301173748980791, |
|
"loss": 1.2686, |
|
"step": 17800 |
|
}, |
|
{ |
|
"epoch": 1.0182604243699869, |
|
"grad_norm": 1.7075704336166382, |
|
"learning_rate": 0.0005286004133720159, |
|
"loss": 1.2661, |
|
"step": 17900 |
|
}, |
|
{ |
|
"epoch": 1.0239490300927243, |
|
"grad_norm": 1.7390458583831787, |
|
"learning_rate": 0.0005270834518459526, |
|
"loss": 1.2698, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 1.0296376358154615, |
|
"grad_norm": 1.980185627937317, |
|
"learning_rate": 0.0005255664903198893, |
|
"loss": 1.2569, |
|
"step": 18100 |
|
}, |
|
{ |
|
"epoch": 1.035326241538199, |
|
"grad_norm": 1.79970383644104, |
|
"learning_rate": 0.0005240495287938261, |
|
"loss": 1.2738, |
|
"step": 18200 |
|
}, |
|
{ |
|
"epoch": 1.0410148472609364, |
|
"grad_norm": 1.6184749603271484, |
|
"learning_rate": 0.0005225325672677627, |
|
"loss": 1.2637, |
|
"step": 18300 |
|
}, |
|
{ |
|
"epoch": 1.0467034529836736, |
|
"grad_norm": 2.3463358879089355, |
|
"learning_rate": 0.0005210156057416993, |
|
"loss": 1.2665, |
|
"step": 18400 |
|
}, |
|
{ |
|
"epoch": 1.052392058706411, |
|
"grad_norm": 1.8550745248794556, |
|
"learning_rate": 0.0005194986442156361, |
|
"loss": 1.2664, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 1.0580806644291485, |
|
"grad_norm": 1.8582580089569092, |
|
"learning_rate": 0.0005179816826895728, |
|
"loss": 1.2442, |
|
"step": 18600 |
|
}, |
|
{ |
|
"epoch": 1.0637692701518857, |
|
"grad_norm": 1.88007390499115, |
|
"learning_rate": 0.0005164647211635095, |
|
"loss": 1.2536, |
|
"step": 18700 |
|
}, |
|
{ |
|
"epoch": 1.0694578758746232, |
|
"grad_norm": 1.804671287536621, |
|
"learning_rate": 0.0005149477596374462, |
|
"loss": 1.2459, |
|
"step": 18800 |
|
}, |
|
{ |
|
"epoch": 1.0751464815973604, |
|
"grad_norm": 1.7329107522964478, |
|
"learning_rate": 0.0005134307981113829, |
|
"loss": 1.2499, |
|
"step": 18900 |
|
}, |
|
{ |
|
"epoch": 1.0808350873200978, |
|
"grad_norm": 1.693323016166687, |
|
"learning_rate": 0.0005119290062005802, |
|
"loss": 1.25, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 1.0865236930428352, |
|
"grad_norm": 1.600060224533081, |
|
"learning_rate": 0.000510412044674517, |
|
"loss": 1.2515, |
|
"step": 19100 |
|
}, |
|
{ |
|
"epoch": 1.0922122987655725, |
|
"grad_norm": 1.8084614276885986, |
|
"learning_rate": 0.0005088950831484537, |
|
"loss": 1.246, |
|
"step": 19200 |
|
}, |
|
{ |
|
"epoch": 1.09790090448831, |
|
"grad_norm": 1.8022205829620361, |
|
"learning_rate": 0.0005073781216223904, |
|
"loss": 1.2597, |
|
"step": 19300 |
|
}, |
|
{ |
|
"epoch": 1.1035895102110473, |
|
"grad_norm": 1.6137562990188599, |
|
"learning_rate": 0.0005058611600963271, |
|
"loss": 1.2685, |
|
"step": 19400 |
|
}, |
|
{ |
|
"epoch": 1.1092781159337846, |
|
"grad_norm": 1.7756201028823853, |
|
"learning_rate": 0.0005043441985702637, |
|
"loss": 1.2606, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 1.114966721656522, |
|
"grad_norm": 1.8828805685043335, |
|
"learning_rate": 0.0005028272370442004, |
|
"loss": 1.2582, |
|
"step": 19600 |
|
}, |
|
{ |
|
"epoch": 1.1206553273792594, |
|
"grad_norm": 1.6829185485839844, |
|
"learning_rate": 0.0005013102755181372, |
|
"loss": 1.2563, |
|
"step": 19700 |
|
}, |
|
{ |
|
"epoch": 1.1263439331019967, |
|
"grad_norm": 1.6716195344924927, |
|
"learning_rate": 0.0004997933139920739, |
|
"loss": 1.2405, |
|
"step": 19800 |
|
}, |
|
{ |
|
"epoch": 1.132032538824734, |
|
"grad_norm": 1.7629872560501099, |
|
"learning_rate": 0.0004982763524660106, |
|
"loss": 1.2649, |
|
"step": 19900 |
|
}, |
|
{ |
|
"epoch": 1.1377211445474713, |
|
"grad_norm": 1.704967737197876, |
|
"learning_rate": 0.0004967593909399473, |
|
"loss": 1.226, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 1.1377211445474713, |
|
"eval_accuracy": 0.692396, |
|
"eval_loss": 1.2322564125061035, |
|
"eval_runtime": 82.0826, |
|
"eval_samples_per_second": 3045.712, |
|
"eval_steps_per_second": 11.903, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 1.1434097502702087, |
|
"grad_norm": 1.5182781219482422, |
|
"learning_rate": 0.000495242429413884, |
|
"loss": 1.2343, |
|
"step": 20100 |
|
}, |
|
{ |
|
"epoch": 1.1490983559929462, |
|
"grad_norm": 2.637796640396118, |
|
"learning_rate": 0.0004937254678878207, |
|
"loss": 1.2506, |
|
"step": 20200 |
|
}, |
|
{ |
|
"epoch": 1.1547869617156834, |
|
"grad_norm": 1.8955748081207275, |
|
"learning_rate": 0.0004922085063617575, |
|
"loss": 1.2625, |
|
"step": 20300 |
|
}, |
|
{ |
|
"epoch": 1.1604755674384208, |
|
"grad_norm": 2.0370640754699707, |
|
"learning_rate": 0.0004906915448356942, |
|
"loss": 1.2551, |
|
"step": 20400 |
|
}, |
|
{ |
|
"epoch": 1.1661641731611583, |
|
"grad_norm": 1.8047020435333252, |
|
"learning_rate": 0.0004891745833096308, |
|
"loss": 1.2489, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 1.1718527788838955, |
|
"grad_norm": 1.5440089702606201, |
|
"learning_rate": 0.0004876576217835675, |
|
"loss": 1.2646, |
|
"step": 20600 |
|
}, |
|
{ |
|
"epoch": 1.177541384606633, |
|
"grad_norm": 1.5029830932617188, |
|
"learning_rate": 0.00048614066025750425, |
|
"loss": 1.2536, |
|
"step": 20700 |
|
}, |
|
{ |
|
"epoch": 1.1832299903293704, |
|
"grad_norm": 1.4674205780029297, |
|
"learning_rate": 0.0004846236987314409, |
|
"loss": 1.2457, |
|
"step": 20800 |
|
}, |
|
{ |
|
"epoch": 1.1889185960521076, |
|
"grad_norm": 1.5259037017822266, |
|
"learning_rate": 0.00048310673720537765, |
|
"loss": 1.2449, |
|
"step": 20900 |
|
}, |
|
{ |
|
"epoch": 1.194607201774845, |
|
"grad_norm": 1.6339012384414673, |
|
"learning_rate": 0.0004815897756793144, |
|
"loss": 1.2163, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 1.2002958074975822, |
|
"grad_norm": 1.5565885305404663, |
|
"learning_rate": 0.00048007281415325106, |
|
"loss": 1.2461, |
|
"step": 21100 |
|
}, |
|
{ |
|
"epoch": 1.2059844132203197, |
|
"grad_norm": 1.676540493965149, |
|
"learning_rate": 0.0004785558526271878, |
|
"loss": 1.2555, |
|
"step": 21200 |
|
}, |
|
{ |
|
"epoch": 1.2116730189430571, |
|
"grad_norm": 1.6003342866897583, |
|
"learning_rate": 0.00047705406071638513, |
|
"loss": 1.2507, |
|
"step": 21300 |
|
}, |
|
{ |
|
"epoch": 1.2173616246657943, |
|
"grad_norm": 2.2655630111694336, |
|
"learning_rate": 0.00047553709919032175, |
|
"loss": 1.2144, |
|
"step": 21400 |
|
}, |
|
{ |
|
"epoch": 1.2230502303885318, |
|
"grad_norm": 1.695094108581543, |
|
"learning_rate": 0.0004740201376642585, |
|
"loss": 1.2415, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 1.2287388361112692, |
|
"grad_norm": 1.8387731313705444, |
|
"learning_rate": 0.0004725031761381952, |
|
"loss": 1.2406, |
|
"step": 21600 |
|
}, |
|
{ |
|
"epoch": 1.2344274418340064, |
|
"grad_norm": 1.6776598691940308, |
|
"learning_rate": 0.0004709862146121319, |
|
"loss": 1.2673, |
|
"step": 21700 |
|
}, |
|
{ |
|
"epoch": 1.2401160475567439, |
|
"grad_norm": 1.6573506593704224, |
|
"learning_rate": 0.0004694692530860686, |
|
"loss": 1.2587, |
|
"step": 21800 |
|
}, |
|
{ |
|
"epoch": 1.2458046532794813, |
|
"grad_norm": 1.6786317825317383, |
|
"learning_rate": 0.00046795229156000535, |
|
"loss": 1.2464, |
|
"step": 21900 |
|
}, |
|
{ |
|
"epoch": 1.2514932590022185, |
|
"grad_norm": 1.887971043586731, |
|
"learning_rate": 0.00046643533003394203, |
|
"loss": 1.2501, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 1.257181864724956, |
|
"grad_norm": 1.7499343156814575, |
|
"learning_rate": 0.00046491836850787876, |
|
"loss": 1.2296, |
|
"step": 22100 |
|
}, |
|
{ |
|
"epoch": 1.2628704704476932, |
|
"grad_norm": 2.057670831680298, |
|
"learning_rate": 0.00046340140698181544, |
|
"loss": 1.2346, |
|
"step": 22200 |
|
}, |
|
{ |
|
"epoch": 1.2685590761704306, |
|
"grad_norm": 1.7353135347366333, |
|
"learning_rate": 0.00046188444545575217, |
|
"loss": 1.2512, |
|
"step": 22300 |
|
}, |
|
{ |
|
"epoch": 1.274247681893168, |
|
"grad_norm": 2.0662734508514404, |
|
"learning_rate": 0.0004603674839296889, |
|
"loss": 1.2493, |
|
"step": 22400 |
|
}, |
|
{ |
|
"epoch": 1.2799362876159053, |
|
"grad_norm": 1.5519914627075195, |
|
"learning_rate": 0.0004588505224036255, |
|
"loss": 1.2404, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 1.2856248933386427, |
|
"grad_norm": 1.8667906522750854, |
|
"learning_rate": 0.0004573335608775622, |
|
"loss": 1.247, |
|
"step": 22600 |
|
}, |
|
{ |
|
"epoch": 1.29131349906138, |
|
"grad_norm": 1.8621453046798706, |
|
"learning_rate": 0.00045581659935149893, |
|
"loss": 1.2428, |
|
"step": 22700 |
|
}, |
|
{ |
|
"epoch": 1.2970021047841174, |
|
"grad_norm": 1.7203937768936157, |
|
"learning_rate": 0.00045429963782543566, |
|
"loss": 1.2273, |
|
"step": 22800 |
|
}, |
|
{ |
|
"epoch": 1.3026907105068548, |
|
"grad_norm": 1.7497667074203491, |
|
"learning_rate": 0.00045278267629937234, |
|
"loss": 1.2458, |
|
"step": 22900 |
|
}, |
|
{ |
|
"epoch": 1.3083793162295922, |
|
"grad_norm": 2.057507276535034, |
|
"learning_rate": 0.0004512657147733091, |
|
"loss": 1.2325, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 1.3140679219523295, |
|
"grad_norm": 1.4594337940216064, |
|
"learning_rate": 0.0004497487532472458, |
|
"loss": 1.2319, |
|
"step": 23100 |
|
}, |
|
{ |
|
"epoch": 1.319756527675067, |
|
"grad_norm": 2.1696736812591553, |
|
"learning_rate": 0.0004482317917211825, |
|
"loss": 1.234, |
|
"step": 23200 |
|
}, |
|
{ |
|
"epoch": 1.3254451333978041, |
|
"grad_norm": 1.8165019750595093, |
|
"learning_rate": 0.0004467148301951192, |
|
"loss": 1.2256, |
|
"step": 23300 |
|
}, |
|
{ |
|
"epoch": 1.3311337391205416, |
|
"grad_norm": 1.5531728267669678, |
|
"learning_rate": 0.00044519786866905594, |
|
"loss": 1.2518, |
|
"step": 23400 |
|
}, |
|
{ |
|
"epoch": 1.336822344843279, |
|
"grad_norm": 1.4592831134796143, |
|
"learning_rate": 0.0004436809071429926, |
|
"loss": 1.2192, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 1.3425109505660162, |
|
"grad_norm": 1.74478280544281, |
|
"learning_rate": 0.00044216394561692935, |
|
"loss": 1.2427, |
|
"step": 23600 |
|
}, |
|
{ |
|
"epoch": 1.3481995562887537, |
|
"grad_norm": 1.8685113191604614, |
|
"learning_rate": 0.000440646984090866, |
|
"loss": 1.2581, |
|
"step": 23700 |
|
}, |
|
{ |
|
"epoch": 1.3538881620114909, |
|
"grad_norm": 1.7366535663604736, |
|
"learning_rate": 0.0004391300225648027, |
|
"loss": 1.2471, |
|
"step": 23800 |
|
}, |
|
{ |
|
"epoch": 1.3595767677342283, |
|
"grad_norm": 1.6585444211959839, |
|
"learning_rate": 0.0004376130610387394, |
|
"loss": 1.2198, |
|
"step": 23900 |
|
}, |
|
{ |
|
"epoch": 1.3652653734569657, |
|
"grad_norm": 1.9299806356430054, |
|
"learning_rate": 0.0004360960995126761, |
|
"loss": 1.2314, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 1.3709539791797032, |
|
"grad_norm": 1.8172481060028076, |
|
"learning_rate": 0.00043457913798661285, |
|
"loss": 1.2098, |
|
"step": 24100 |
|
}, |
|
{ |
|
"epoch": 1.3766425849024404, |
|
"grad_norm": 1.5579493045806885, |
|
"learning_rate": 0.0004330621764605495, |
|
"loss": 1.2043, |
|
"step": 24200 |
|
}, |
|
{ |
|
"epoch": 1.3823311906251778, |
|
"grad_norm": 1.8178203105926514, |
|
"learning_rate": 0.00043154521493448625, |
|
"loss": 1.2235, |
|
"step": 24300 |
|
}, |
|
{ |
|
"epoch": 1.388019796347915, |
|
"grad_norm": 1.676126480102539, |
|
"learning_rate": 0.000430028253408423, |
|
"loss": 1.2388, |
|
"step": 24400 |
|
}, |
|
{ |
|
"epoch": 1.3937084020706525, |
|
"grad_norm": 1.863893985748291, |
|
"learning_rate": 0.00042851129188235966, |
|
"loss": 1.2206, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 1.39939700779339, |
|
"grad_norm": 1.5618318319320679, |
|
"learning_rate": 0.0004269943303562964, |
|
"loss": 1.2359, |
|
"step": 24600 |
|
}, |
|
{ |
|
"epoch": 1.4050856135161272, |
|
"grad_norm": 1.3972681760787964, |
|
"learning_rate": 0.0004254773688302331, |
|
"loss": 1.2152, |
|
"step": 24700 |
|
}, |
|
{ |
|
"epoch": 1.4107742192388646, |
|
"grad_norm": 1.584274411201477, |
|
"learning_rate": 0.00042396040730416975, |
|
"loss": 1.211, |
|
"step": 24800 |
|
}, |
|
{ |
|
"epoch": 1.4164628249616018, |
|
"grad_norm": 1.7282276153564453, |
|
"learning_rate": 0.0004224434457781064, |
|
"loss": 1.2034, |
|
"step": 24900 |
|
}, |
|
{ |
|
"epoch": 1.4221514306843392, |
|
"grad_norm": 2.2420654296875, |
|
"learning_rate": 0.00042092648425204316, |
|
"loss": 1.2125, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 1.4221514306843392, |
|
"eval_accuracy": 0.703072, |
|
"eval_loss": 1.185011863708496, |
|
"eval_runtime": 81.7158, |
|
"eval_samples_per_second": 3059.385, |
|
"eval_steps_per_second": 11.956, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 1.4278400364070767, |
|
"grad_norm": 1.5998648405075073, |
|
"learning_rate": 0.0004194095227259799, |
|
"loss": 1.2145, |
|
"step": 25100 |
|
}, |
|
{ |
|
"epoch": 1.433528642129814, |
|
"grad_norm": 1.8546173572540283, |
|
"learning_rate": 0.00041789256119991656, |
|
"loss": 1.2293, |
|
"step": 25200 |
|
}, |
|
{ |
|
"epoch": 1.4392172478525513, |
|
"grad_norm": 1.6815022230148315, |
|
"learning_rate": 0.00041639076928911395, |
|
"loss": 1.2165, |
|
"step": 25300 |
|
}, |
|
{ |
|
"epoch": 1.4449058535752888, |
|
"grad_norm": 1.5567988157272339, |
|
"learning_rate": 0.0004148889773783113, |
|
"loss": 1.2231, |
|
"step": 25400 |
|
}, |
|
{ |
|
"epoch": 1.450594459298026, |
|
"grad_norm": 1.9424728155136108, |
|
"learning_rate": 0.000413372015852248, |
|
"loss": 1.2302, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 1.4562830650207634, |
|
"grad_norm": 1.9052510261535645, |
|
"learning_rate": 0.00041185505432618464, |
|
"loss": 1.2174, |
|
"step": 25600 |
|
}, |
|
{ |
|
"epoch": 1.4619716707435009, |
|
"grad_norm": 1.470513939857483, |
|
"learning_rate": 0.0004103380928001213, |
|
"loss": 1.2167, |
|
"step": 25700 |
|
}, |
|
{ |
|
"epoch": 1.467660276466238, |
|
"grad_norm": 1.5082899332046509, |
|
"learning_rate": 0.00040882113127405805, |
|
"loss": 1.2098, |
|
"step": 25800 |
|
}, |
|
{ |
|
"epoch": 1.4733488821889755, |
|
"grad_norm": 1.7309447526931763, |
|
"learning_rate": 0.0004073041697479948, |
|
"loss": 1.2111, |
|
"step": 25900 |
|
}, |
|
{ |
|
"epoch": 1.4790374879117127, |
|
"grad_norm": 1.7894470691680908, |
|
"learning_rate": 0.00040578720822193146, |
|
"loss": 1.2237, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 1.4847260936344502, |
|
"grad_norm": 1.712557077407837, |
|
"learning_rate": 0.0004042702466958682, |
|
"loss": 1.2314, |
|
"step": 26100 |
|
}, |
|
{ |
|
"epoch": 1.4904146993571876, |
|
"grad_norm": 1.794565200805664, |
|
"learning_rate": 0.0004027532851698049, |
|
"loss": 1.2072, |
|
"step": 26200 |
|
}, |
|
{ |
|
"epoch": 1.4961033050799248, |
|
"grad_norm": 1.655179500579834, |
|
"learning_rate": 0.0004012363236437416, |
|
"loss": 1.218, |
|
"step": 26300 |
|
}, |
|
{ |
|
"epoch": 1.5017919108026623, |
|
"grad_norm": 1.8749343156814575, |
|
"learning_rate": 0.00039971936211767833, |
|
"loss": 1.2037, |
|
"step": 26400 |
|
}, |
|
{ |
|
"epoch": 1.5074805165253995, |
|
"grad_norm": 1.5337320566177368, |
|
"learning_rate": 0.000398202400591615, |
|
"loss": 1.2179, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 1.513169122248137, |
|
"grad_norm": 1.5731686353683472, |
|
"learning_rate": 0.0003966854390655517, |
|
"loss": 1.2037, |
|
"step": 26600 |
|
}, |
|
{ |
|
"epoch": 1.5188577279708744, |
|
"grad_norm": 1.5700329542160034, |
|
"learning_rate": 0.0003951684775394884, |
|
"loss": 1.2189, |
|
"step": 26700 |
|
}, |
|
{ |
|
"epoch": 1.5245463336936118, |
|
"grad_norm": 1.9315118789672852, |
|
"learning_rate": 0.00039365151601342515, |
|
"loss": 1.2314, |
|
"step": 26800 |
|
}, |
|
{ |
|
"epoch": 1.530234939416349, |
|
"grad_norm": 1.6017844676971436, |
|
"learning_rate": 0.0003921345544873618, |
|
"loss": 1.211, |
|
"step": 26900 |
|
}, |
|
{ |
|
"epoch": 1.5359235451390862, |
|
"grad_norm": 1.586595058441162, |
|
"learning_rate": 0.0003906175929612985, |
|
"loss": 1.2079, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 1.5416121508618237, |
|
"grad_norm": 1.8215593099594116, |
|
"learning_rate": 0.00038910063143523523, |
|
"loss": 1.2022, |
|
"step": 27100 |
|
}, |
|
{ |
|
"epoch": 1.5473007565845611, |
|
"grad_norm": 1.7390124797821045, |
|
"learning_rate": 0.00038758366990917196, |
|
"loss": 1.2143, |
|
"step": 27200 |
|
}, |
|
{ |
|
"epoch": 1.5529893623072986, |
|
"grad_norm": 1.792608618736267, |
|
"learning_rate": 0.00038606670838310864, |
|
"loss": 1.2104, |
|
"step": 27300 |
|
}, |
|
{ |
|
"epoch": 1.558677968030036, |
|
"grad_norm": 1.802167296409607, |
|
"learning_rate": 0.00038454974685704537, |
|
"loss": 1.1924, |
|
"step": 27400 |
|
}, |
|
{ |
|
"epoch": 1.5643665737527732, |
|
"grad_norm": 1.7943332195281982, |
|
"learning_rate": 0.0003830327853309821, |
|
"loss": 1.2096, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 1.5700551794755104, |
|
"grad_norm": 1.745893120765686, |
|
"learning_rate": 0.0003815158238049187, |
|
"loss": 1.1941, |
|
"step": 27600 |
|
}, |
|
{ |
|
"epoch": 1.5757437851982479, |
|
"grad_norm": 1.6740118265151978, |
|
"learning_rate": 0.00037999886227885546, |
|
"loss": 1.2355, |
|
"step": 27700 |
|
}, |
|
{ |
|
"epoch": 1.5814323909209853, |
|
"grad_norm": 1.681840419769287, |
|
"learning_rate": 0.0003784970703680528, |
|
"loss": 1.2034, |
|
"step": 27800 |
|
}, |
|
{ |
|
"epoch": 1.5871209966437227, |
|
"grad_norm": 1.6897751092910767, |
|
"learning_rate": 0.0003769801088419895, |
|
"loss": 1.2169, |
|
"step": 27900 |
|
}, |
|
{ |
|
"epoch": 1.59280960236646, |
|
"grad_norm": 1.686784267425537, |
|
"learning_rate": 0.0003754631473159262, |
|
"loss": 1.2093, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 1.5984982080891972, |
|
"grad_norm": 1.6020421981811523, |
|
"learning_rate": 0.00037394618578986293, |
|
"loss": 1.2131, |
|
"step": 28100 |
|
}, |
|
{ |
|
"epoch": 1.6041868138119346, |
|
"grad_norm": 1.478246808052063, |
|
"learning_rate": 0.0003724292242637996, |
|
"loss": 1.2048, |
|
"step": 28200 |
|
}, |
|
{ |
|
"epoch": 1.609875419534672, |
|
"grad_norm": 1.4912410974502563, |
|
"learning_rate": 0.00037091226273773634, |
|
"loss": 1.18, |
|
"step": 28300 |
|
}, |
|
{ |
|
"epoch": 1.6155640252574095, |
|
"grad_norm": 1.6362539529800415, |
|
"learning_rate": 0.00036939530121167307, |
|
"loss": 1.2022, |
|
"step": 28400 |
|
}, |
|
{ |
|
"epoch": 1.6212526309801467, |
|
"grad_norm": 1.5238479375839233, |
|
"learning_rate": 0.00036787833968560975, |
|
"loss": 1.2105, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 1.6269412367028842, |
|
"grad_norm": 1.6359635591506958, |
|
"learning_rate": 0.0003663613781595464, |
|
"loss": 1.1833, |
|
"step": 28600 |
|
}, |
|
{ |
|
"epoch": 1.6326298424256214, |
|
"grad_norm": 1.6206257343292236, |
|
"learning_rate": 0.00036484441663348316, |
|
"loss": 1.1945, |
|
"step": 28700 |
|
}, |
|
{ |
|
"epoch": 1.6383184481483588, |
|
"grad_norm": 1.7032015323638916, |
|
"learning_rate": 0.00036332745510741983, |
|
"loss": 1.2065, |
|
"step": 28800 |
|
}, |
|
{ |
|
"epoch": 1.6440070538710962, |
|
"grad_norm": 1.7177228927612305, |
|
"learning_rate": 0.00036181049358135657, |
|
"loss": 1.2035, |
|
"step": 28900 |
|
}, |
|
{ |
|
"epoch": 1.6496956595938337, |
|
"grad_norm": 1.5967752933502197, |
|
"learning_rate": 0.0003602935320552933, |
|
"loss": 1.2036, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 1.655384265316571, |
|
"grad_norm": 1.6632803678512573, |
|
"learning_rate": 0.00035877657052923, |
|
"loss": 1.226, |
|
"step": 29100 |
|
}, |
|
{ |
|
"epoch": 1.6610728710393081, |
|
"grad_norm": 1.5134357213974, |
|
"learning_rate": 0.00035725960900316665, |
|
"loss": 1.1947, |
|
"step": 29200 |
|
}, |
|
{ |
|
"epoch": 1.6667614767620456, |
|
"grad_norm": 1.5506322383880615, |
|
"learning_rate": 0.0003557426474771034, |
|
"loss": 1.1963, |
|
"step": 29300 |
|
}, |
|
{ |
|
"epoch": 1.672450082484783, |
|
"grad_norm": 1.4821183681488037, |
|
"learning_rate": 0.0003542256859510401, |
|
"loss": 1.2039, |
|
"step": 29400 |
|
}, |
|
{ |
|
"epoch": 1.6781386882075204, |
|
"grad_norm": 2.278379440307617, |
|
"learning_rate": 0.0003527087244249768, |
|
"loss": 1.205, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 1.6838272939302577, |
|
"grad_norm": 1.5077921152114868, |
|
"learning_rate": 0.0003511917628989135, |
|
"loss": 1.1984, |
|
"step": 29600 |
|
}, |
|
{ |
|
"epoch": 1.689515899652995, |
|
"grad_norm": 1.629607915878296, |
|
"learning_rate": 0.0003496748013728502, |
|
"loss": 1.2023, |
|
"step": 29700 |
|
}, |
|
{ |
|
"epoch": 1.6952045053757323, |
|
"grad_norm": 1.5007668733596802, |
|
"learning_rate": 0.0003481578398467869, |
|
"loss": 1.1907, |
|
"step": 29800 |
|
}, |
|
{ |
|
"epoch": 1.7008931110984697, |
|
"grad_norm": 1.7543882131576538, |
|
"learning_rate": 0.0003466408783207236, |
|
"loss": 1.1949, |
|
"step": 29900 |
|
}, |
|
{ |
|
"epoch": 1.7065817168212072, |
|
"grad_norm": 1.6254594326019287, |
|
"learning_rate": 0.00034512391679466034, |
|
"loss": 1.1912, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 1.7065817168212072, |
|
"eval_accuracy": 0.709248, |
|
"eval_loss": 1.1566522121429443, |
|
"eval_runtime": 79.5399, |
|
"eval_samples_per_second": 3143.077, |
|
"eval_steps_per_second": 12.283, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 1.7122703225439446, |
|
"grad_norm": 1.873049020767212, |
|
"learning_rate": 0.000343606955268597, |
|
"loss": 1.2063, |
|
"step": 30100 |
|
}, |
|
{ |
|
"epoch": 1.7179589282666818, |
|
"grad_norm": 1.5862141847610474, |
|
"learning_rate": 0.00034208999374253375, |
|
"loss": 1.1926, |
|
"step": 30200 |
|
}, |
|
{ |
|
"epoch": 1.723647533989419, |
|
"grad_norm": 1.9915696382522583, |
|
"learning_rate": 0.0003405730322164704, |
|
"loss": 1.1952, |
|
"step": 30300 |
|
}, |
|
{ |
|
"epoch": 1.7293361397121565, |
|
"grad_norm": 1.856048822402954, |
|
"learning_rate": 0.0003390560706904071, |
|
"loss": 1.1953, |
|
"step": 30400 |
|
}, |
|
{ |
|
"epoch": 1.735024745434894, |
|
"grad_norm": 1.6758267879486084, |
|
"learning_rate": 0.00033753910916434383, |
|
"loss": 1.1906, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 1.7407133511576314, |
|
"grad_norm": 1.8683140277862549, |
|
"learning_rate": 0.00033602214763828056, |
|
"loss": 1.2025, |
|
"step": 30600 |
|
}, |
|
{ |
|
"epoch": 1.7464019568803686, |
|
"grad_norm": 1.452721118927002, |
|
"learning_rate": 0.00033450518611221724, |
|
"loss": 1.1866, |
|
"step": 30700 |
|
}, |
|
{ |
|
"epoch": 1.752090562603106, |
|
"grad_norm": 1.5711089372634888, |
|
"learning_rate": 0.0003329882245861539, |
|
"loss": 1.1856, |
|
"step": 30800 |
|
}, |
|
{ |
|
"epoch": 1.7577791683258432, |
|
"grad_norm": 2.0584185123443604, |
|
"learning_rate": 0.00033147126306009065, |
|
"loss": 1.1873, |
|
"step": 30900 |
|
}, |
|
{ |
|
"epoch": 1.7634677740485807, |
|
"grad_norm": 1.5743275880813599, |
|
"learning_rate": 0.0003299543015340274, |
|
"loss": 1.1988, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 1.7691563797713181, |
|
"grad_norm": 1.5788936614990234, |
|
"learning_rate": 0.00032843734000796406, |
|
"loss": 1.1932, |
|
"step": 31100 |
|
}, |
|
{ |
|
"epoch": 1.7748449854940556, |
|
"grad_norm": 1.6406651735305786, |
|
"learning_rate": 0.0003269203784819008, |
|
"loss": 1.1876, |
|
"step": 31200 |
|
}, |
|
{ |
|
"epoch": 1.7805335912167928, |
|
"grad_norm": 1.6410019397735596, |
|
"learning_rate": 0.00032540341695583747, |
|
"loss": 1.1859, |
|
"step": 31300 |
|
}, |
|
{ |
|
"epoch": 1.78622219693953, |
|
"grad_norm": 1.548140287399292, |
|
"learning_rate": 0.00032388645542977414, |
|
"loss": 1.2032, |
|
"step": 31400 |
|
}, |
|
{ |
|
"epoch": 1.7919108026622674, |
|
"grad_norm": 1.9242947101593018, |
|
"learning_rate": 0.0003223694939037109, |
|
"loss": 1.199, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 1.7975994083850049, |
|
"grad_norm": 2.0189428329467773, |
|
"learning_rate": 0.0003208525323776476, |
|
"loss": 1.1832, |
|
"step": 31600 |
|
}, |
|
{ |
|
"epoch": 1.8032880141077423, |
|
"grad_norm": 1.740432620048523, |
|
"learning_rate": 0.0003193355708515843, |
|
"loss": 1.183, |
|
"step": 31700 |
|
}, |
|
{ |
|
"epoch": 1.8089766198304795, |
|
"grad_norm": 1.743503451347351, |
|
"learning_rate": 0.0003178337789407816, |
|
"loss": 1.1991, |
|
"step": 31800 |
|
}, |
|
{ |
|
"epoch": 1.8146652255532167, |
|
"grad_norm": 1.7166736125946045, |
|
"learning_rate": 0.00031631681741471835, |
|
"loss": 1.2031, |
|
"step": 31900 |
|
}, |
|
{ |
|
"epoch": 1.8203538312759542, |
|
"grad_norm": 1.626386046409607, |
|
"learning_rate": 0.000314799855888655, |
|
"loss": 1.1987, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 1.8260424369986916, |
|
"grad_norm": 1.5402131080627441, |
|
"learning_rate": 0.00031328289436259176, |
|
"loss": 1.172, |
|
"step": 32100 |
|
}, |
|
{ |
|
"epoch": 1.831731042721429, |
|
"grad_norm": 1.6522256135940552, |
|
"learning_rate": 0.0003117659328365285, |
|
"loss": 1.179, |
|
"step": 32200 |
|
}, |
|
{ |
|
"epoch": 1.8374196484441665, |
|
"grad_norm": 1.482009768486023, |
|
"learning_rate": 0.00031024897131046517, |
|
"loss": 1.1903, |
|
"step": 32300 |
|
}, |
|
{ |
|
"epoch": 1.8431082541669037, |
|
"grad_norm": 1.6417380571365356, |
|
"learning_rate": 0.00030873200978440184, |
|
"loss": 1.2024, |
|
"step": 32400 |
|
}, |
|
{ |
|
"epoch": 1.848796859889641, |
|
"grad_norm": 1.532333493232727, |
|
"learning_rate": 0.0003072150482583386, |
|
"loss": 1.1843, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 1.8544854656123784, |
|
"grad_norm": 2.004293441772461, |
|
"learning_rate": 0.00030569808673227525, |
|
"loss": 1.192, |
|
"step": 32600 |
|
}, |
|
{ |
|
"epoch": 1.8601740713351158, |
|
"grad_norm": 1.7226125001907349, |
|
"learning_rate": 0.000304181125206212, |
|
"loss": 1.1902, |
|
"step": 32700 |
|
}, |
|
{ |
|
"epoch": 1.8658626770578532, |
|
"grad_norm": 1.7714165449142456, |
|
"learning_rate": 0.0003026641636801487, |
|
"loss": 1.1908, |
|
"step": 32800 |
|
}, |
|
{ |
|
"epoch": 1.8715512827805905, |
|
"grad_norm": 1.5100337266921997, |
|
"learning_rate": 0.00030114720215408534, |
|
"loss": 1.1735, |
|
"step": 32900 |
|
}, |
|
{ |
|
"epoch": 1.8772398885033277, |
|
"grad_norm": 1.6792744398117065, |
|
"learning_rate": 0.00029963024062802207, |
|
"loss": 1.191, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 1.8829284942260651, |
|
"grad_norm": 1.705554723739624, |
|
"learning_rate": 0.0002981132791019588, |
|
"loss": 1.1878, |
|
"step": 33100 |
|
}, |
|
{ |
|
"epoch": 1.8886170999488026, |
|
"grad_norm": 1.4528917074203491, |
|
"learning_rate": 0.0002965963175758955, |
|
"loss": 1.1685, |
|
"step": 33200 |
|
}, |
|
{ |
|
"epoch": 1.89430570567154, |
|
"grad_norm": 1.7752711772918701, |
|
"learning_rate": 0.0002950793560498322, |
|
"loss": 1.1743, |
|
"step": 33300 |
|
}, |
|
{ |
|
"epoch": 1.8999943113942772, |
|
"grad_norm": 1.762074589729309, |
|
"learning_rate": 0.00029356239452376894, |
|
"loss": 1.1775, |
|
"step": 33400 |
|
}, |
|
{ |
|
"epoch": 1.9056829171170147, |
|
"grad_norm": 1.6388828754425049, |
|
"learning_rate": 0.0002920454329977056, |
|
"loss": 1.1762, |
|
"step": 33500 |
|
}, |
|
{ |
|
"epoch": 1.9113715228397519, |
|
"grad_norm": 1.5171791315078735, |
|
"learning_rate": 0.0002905284714716423, |
|
"loss": 1.1649, |
|
"step": 33600 |
|
}, |
|
{ |
|
"epoch": 1.9170601285624893, |
|
"grad_norm": 1.6547460556030273, |
|
"learning_rate": 0.000289011509945579, |
|
"loss": 1.1904, |
|
"step": 33700 |
|
}, |
|
{ |
|
"epoch": 1.9227487342852267, |
|
"grad_norm": 1.705083966255188, |
|
"learning_rate": 0.00028750971803477636, |
|
"loss": 1.1667, |
|
"step": 33800 |
|
}, |
|
{ |
|
"epoch": 1.9284373400079642, |
|
"grad_norm": 1.731803059577942, |
|
"learning_rate": 0.00028599275650871304, |
|
"loss": 1.1788, |
|
"step": 33900 |
|
}, |
|
{ |
|
"epoch": 1.9341259457307014, |
|
"grad_norm": 2.056766986846924, |
|
"learning_rate": 0.00028447579498264977, |
|
"loss": 1.1878, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 1.9398145514534386, |
|
"grad_norm": 1.8016914129257202, |
|
"learning_rate": 0.00028295883345658644, |
|
"loss": 1.1632, |
|
"step": 34100 |
|
}, |
|
{ |
|
"epoch": 1.945503157176176, |
|
"grad_norm": 1.7706475257873535, |
|
"learning_rate": 0.0002814418719305232, |
|
"loss": 1.1658, |
|
"step": 34200 |
|
}, |
|
{ |
|
"epoch": 1.9511917628989135, |
|
"grad_norm": 1.8184970617294312, |
|
"learning_rate": 0.0002799249104044599, |
|
"loss": 1.1666, |
|
"step": 34300 |
|
}, |
|
{ |
|
"epoch": 1.956880368621651, |
|
"grad_norm": 1.6529743671417236, |
|
"learning_rate": 0.0002784079488783966, |
|
"loss": 1.1846, |
|
"step": 34400 |
|
}, |
|
{ |
|
"epoch": 1.9625689743443882, |
|
"grad_norm": 1.5860931873321533, |
|
"learning_rate": 0.00027689098735233326, |
|
"loss": 1.1917, |
|
"step": 34500 |
|
}, |
|
{ |
|
"epoch": 1.9682575800671256, |
|
"grad_norm": 1.672756552696228, |
|
"learning_rate": 0.00027537402582627, |
|
"loss": 1.1654, |
|
"step": 34600 |
|
}, |
|
{ |
|
"epoch": 1.9739461857898628, |
|
"grad_norm": 1.7606583833694458, |
|
"learning_rate": 0.0002738570643002067, |
|
"loss": 1.176, |
|
"step": 34700 |
|
}, |
|
{ |
|
"epoch": 1.9796347915126002, |
|
"grad_norm": 1.912277340888977, |
|
"learning_rate": 0.0002723401027741434, |
|
"loss": 1.1695, |
|
"step": 34800 |
|
}, |
|
{ |
|
"epoch": 1.9853233972353377, |
|
"grad_norm": 1.7096484899520874, |
|
"learning_rate": 0.00027082314124808013, |
|
"loss": 1.1669, |
|
"step": 34900 |
|
}, |
|
{ |
|
"epoch": 1.9910120029580751, |
|
"grad_norm": 1.7793241739273071, |
|
"learning_rate": 0.0002693061797220168, |
|
"loss": 1.1902, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 1.9910120029580751, |
|
"eval_accuracy": 0.71648, |
|
"eval_loss": 1.1297262907028198, |
|
"eval_runtime": 79.5966, |
|
"eval_samples_per_second": 3140.839, |
|
"eval_steps_per_second": 12.274, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 1.9967006086808123, |
|
"grad_norm": 1.4913907051086426, |
|
"learning_rate": 0.0002677892181959535, |
|
"loss": 1.1607, |
|
"step": 35100 |
|
}, |
|
{ |
|
"epoch": 2.0023892144035496, |
|
"grad_norm": 1.639985203742981, |
|
"learning_rate": 0.0002662874262851509, |
|
"loss": 1.1727, |
|
"step": 35200 |
|
}, |
|
{ |
|
"epoch": 2.008077820126287, |
|
"grad_norm": 1.6419970989227295, |
|
"learning_rate": 0.00026477046475908755, |
|
"loss": 1.1392, |
|
"step": 35300 |
|
}, |
|
{ |
|
"epoch": 2.0137664258490244, |
|
"grad_norm": 1.8132672309875488, |
|
"learning_rate": 0.00026325350323302423, |
|
"loss": 1.1503, |
|
"step": 35400 |
|
}, |
|
{ |
|
"epoch": 2.019455031571762, |
|
"grad_norm": 1.4656819105148315, |
|
"learning_rate": 0.00026173654170696096, |
|
"loss": 1.1565, |
|
"step": 35500 |
|
}, |
|
{ |
|
"epoch": 2.0251436372944993, |
|
"grad_norm": 1.3595716953277588, |
|
"learning_rate": 0.0002602195801808977, |
|
"loss": 1.1526, |
|
"step": 35600 |
|
}, |
|
{ |
|
"epoch": 2.0308322430172363, |
|
"grad_norm": 1.6904360055923462, |
|
"learning_rate": 0.00025870261865483437, |
|
"loss": 1.1448, |
|
"step": 35700 |
|
}, |
|
{ |
|
"epoch": 2.0365208487399737, |
|
"grad_norm": 1.7240209579467773, |
|
"learning_rate": 0.0002571856571287711, |
|
"loss": 1.1424, |
|
"step": 35800 |
|
}, |
|
{ |
|
"epoch": 2.042209454462711, |
|
"grad_norm": 1.5376731157302856, |
|
"learning_rate": 0.00025566869560270783, |
|
"loss": 1.143, |
|
"step": 35900 |
|
}, |
|
{ |
|
"epoch": 2.0478980601854486, |
|
"grad_norm": 1.893202781677246, |
|
"learning_rate": 0.00025415173407664445, |
|
"loss": 1.1519, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 2.053586665908186, |
|
"grad_norm": 1.9057211875915527, |
|
"learning_rate": 0.0002526347725505812, |
|
"loss": 1.1375, |
|
"step": 36100 |
|
}, |
|
{ |
|
"epoch": 2.059275271630923, |
|
"grad_norm": 1.7818187475204468, |
|
"learning_rate": 0.0002511178110245179, |
|
"loss": 1.1424, |
|
"step": 36200 |
|
}, |
|
{ |
|
"epoch": 2.0649638773536605, |
|
"grad_norm": 1.825323462486267, |
|
"learning_rate": 0.0002496008494984546, |
|
"loss": 1.1196, |
|
"step": 36300 |
|
}, |
|
{ |
|
"epoch": 2.070652483076398, |
|
"grad_norm": 2.0049736499786377, |
|
"learning_rate": 0.0002480838879723913, |
|
"loss": 1.1317, |
|
"step": 36400 |
|
}, |
|
{ |
|
"epoch": 2.0763410887991354, |
|
"grad_norm": 1.599846363067627, |
|
"learning_rate": 0.000246566926446328, |
|
"loss": 1.1573, |
|
"step": 36500 |
|
}, |
|
{ |
|
"epoch": 2.082029694521873, |
|
"grad_norm": 1.5434855222702026, |
|
"learning_rate": 0.00024504996492026473, |
|
"loss": 1.1493, |
|
"step": 36600 |
|
}, |
|
{ |
|
"epoch": 2.0877183002446102, |
|
"grad_norm": 1.6306787729263306, |
|
"learning_rate": 0.0002435330033942014, |
|
"loss": 1.1564, |
|
"step": 36700 |
|
}, |
|
{ |
|
"epoch": 2.0934069059673472, |
|
"grad_norm": 1.6914353370666504, |
|
"learning_rate": 0.00024201604186813814, |
|
"loss": 1.1395, |
|
"step": 36800 |
|
}, |
|
{ |
|
"epoch": 2.0990955116900847, |
|
"grad_norm": 1.6444432735443115, |
|
"learning_rate": 0.00024049908034207485, |
|
"loss": 1.1615, |
|
"step": 36900 |
|
}, |
|
{ |
|
"epoch": 2.104784117412822, |
|
"grad_norm": 1.821244239807129, |
|
"learning_rate": 0.00023898211881601155, |
|
"loss": 1.1429, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 2.1104727231355596, |
|
"grad_norm": 1.6050491333007812, |
|
"learning_rate": 0.00023746515728994823, |
|
"loss": 1.1376, |
|
"step": 37100 |
|
}, |
|
{ |
|
"epoch": 2.116161328858297, |
|
"grad_norm": 1.7375249862670898, |
|
"learning_rate": 0.00023594819576388493, |
|
"loss": 1.1253, |
|
"step": 37200 |
|
}, |
|
{ |
|
"epoch": 2.121849934581034, |
|
"grad_norm": 2.0717177391052246, |
|
"learning_rate": 0.00023443123423782166, |
|
"loss": 1.1527, |
|
"step": 37300 |
|
}, |
|
{ |
|
"epoch": 2.1275385403037714, |
|
"grad_norm": 1.43324875831604, |
|
"learning_rate": 0.00023291427271175837, |
|
"loss": 1.1475, |
|
"step": 37400 |
|
}, |
|
{ |
|
"epoch": 2.133227146026509, |
|
"grad_norm": 1.448669195175171, |
|
"learning_rate": 0.00023139731118569507, |
|
"loss": 1.1133, |
|
"step": 37500 |
|
}, |
|
{ |
|
"epoch": 2.1389157517492463, |
|
"grad_norm": 1.521912932395935, |
|
"learning_rate": 0.00022988034965963178, |
|
"loss": 1.1292, |
|
"step": 37600 |
|
}, |
|
{ |
|
"epoch": 2.1446043574719837, |
|
"grad_norm": 1.6070728302001953, |
|
"learning_rate": 0.00022836338813356845, |
|
"loss": 1.1384, |
|
"step": 37700 |
|
}, |
|
{ |
|
"epoch": 2.1502929631947207, |
|
"grad_norm": 1.3853884935379028, |
|
"learning_rate": 0.00022684642660750516, |
|
"loss": 1.1344, |
|
"step": 37800 |
|
}, |
|
{ |
|
"epoch": 2.155981568917458, |
|
"grad_norm": 1.569415807723999, |
|
"learning_rate": 0.0002253294650814419, |
|
"loss": 1.1572, |
|
"step": 37900 |
|
}, |
|
{ |
|
"epoch": 2.1616701746401956, |
|
"grad_norm": 1.544966220855713, |
|
"learning_rate": 0.0002238125035553786, |
|
"loss": 1.1378, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 2.167358780362933, |
|
"grad_norm": 1.6090420484542847, |
|
"learning_rate": 0.0002222955420293153, |
|
"loss": 1.1331, |
|
"step": 38100 |
|
}, |
|
{ |
|
"epoch": 2.1730473860856705, |
|
"grad_norm": 1.542605996131897, |
|
"learning_rate": 0.00022077858050325197, |
|
"loss": 1.1302, |
|
"step": 38200 |
|
}, |
|
{ |
|
"epoch": 2.178735991808408, |
|
"grad_norm": 1.744084119796753, |
|
"learning_rate": 0.00021926161897718868, |
|
"loss": 1.1305, |
|
"step": 38300 |
|
}, |
|
{ |
|
"epoch": 2.184424597531145, |
|
"grad_norm": 1.630118489265442, |
|
"learning_rate": 0.0002177446574511254, |
|
"loss": 1.1294, |
|
"step": 38400 |
|
}, |
|
{ |
|
"epoch": 2.1901132032538824, |
|
"grad_norm": 1.6920104026794434, |
|
"learning_rate": 0.0002162276959250621, |
|
"loss": 1.1337, |
|
"step": 38500 |
|
}, |
|
{ |
|
"epoch": 2.19580180897662, |
|
"grad_norm": 1.654189944267273, |
|
"learning_rate": 0.00021471073439899882, |
|
"loss": 1.1182, |
|
"step": 38600 |
|
}, |
|
{ |
|
"epoch": 2.2014904146993572, |
|
"grad_norm": 1.8575996160507202, |
|
"learning_rate": 0.00021319377287293555, |
|
"loss": 1.1261, |
|
"step": 38700 |
|
}, |
|
{ |
|
"epoch": 2.2071790204220947, |
|
"grad_norm": 1.5796535015106201, |
|
"learning_rate": 0.0002116768113468722, |
|
"loss": 1.1389, |
|
"step": 38800 |
|
}, |
|
{ |
|
"epoch": 2.212867626144832, |
|
"grad_norm": 1.6893657445907593, |
|
"learning_rate": 0.00021015984982080893, |
|
"loss": 1.122, |
|
"step": 38900 |
|
}, |
|
{ |
|
"epoch": 2.218556231867569, |
|
"grad_norm": 1.5983092784881592, |
|
"learning_rate": 0.00020864288829474563, |
|
"loss": 1.1487, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 2.2242448375903066, |
|
"grad_norm": 1.632049798965454, |
|
"learning_rate": 0.00020712592676868234, |
|
"loss": 1.1476, |
|
"step": 39100 |
|
}, |
|
{ |
|
"epoch": 2.229933443313044, |
|
"grad_norm": 2.039854049682617, |
|
"learning_rate": 0.00020562413485787965, |
|
"loss": 1.1443, |
|
"step": 39200 |
|
}, |
|
{ |
|
"epoch": 2.2356220490357814, |
|
"grad_norm": 1.5673627853393555, |
|
"learning_rate": 0.00020410717333181638, |
|
"loss": 1.1259, |
|
"step": 39300 |
|
}, |
|
{ |
|
"epoch": 2.241310654758519, |
|
"grad_norm": 1.6900497674942017, |
|
"learning_rate": 0.00020259021180575308, |
|
"loss": 1.1356, |
|
"step": 39400 |
|
}, |
|
{ |
|
"epoch": 2.246999260481256, |
|
"grad_norm": 1.8306878805160522, |
|
"learning_rate": 0.00020107325027968979, |
|
"loss": 1.1349, |
|
"step": 39500 |
|
}, |
|
{ |
|
"epoch": 2.2526878662039933, |
|
"grad_norm": 1.620490550994873, |
|
"learning_rate": 0.0001995562887536265, |
|
"loss": 1.1417, |
|
"step": 39600 |
|
}, |
|
{ |
|
"epoch": 2.2583764719267307, |
|
"grad_norm": 1.828751802444458, |
|
"learning_rate": 0.0001980393272275632, |
|
"loss": 1.1302, |
|
"step": 39700 |
|
}, |
|
{ |
|
"epoch": 2.264065077649468, |
|
"grad_norm": 1.4963942766189575, |
|
"learning_rate": 0.0001965223657014999, |
|
"loss": 1.152, |
|
"step": 39800 |
|
}, |
|
{ |
|
"epoch": 2.2697536833722056, |
|
"grad_norm": 2.081669807434082, |
|
"learning_rate": 0.0001950054041754366, |
|
"loss": 1.1385, |
|
"step": 39900 |
|
}, |
|
{ |
|
"epoch": 2.2754422890949426, |
|
"grad_norm": 1.6873656511306763, |
|
"learning_rate": 0.0001934884426493733, |
|
"loss": 1.131, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 2.2754422890949426, |
|
"eval_accuracy": 0.721316, |
|
"eval_loss": 1.1105972528457642, |
|
"eval_runtime": 80.6985, |
|
"eval_samples_per_second": 3097.949, |
|
"eval_steps_per_second": 12.107, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 2.28113089481768, |
|
"grad_norm": 1.5599457025527954, |
|
"learning_rate": 0.00019197148112331004, |
|
"loss": 1.1525, |
|
"step": 40100 |
|
}, |
|
{ |
|
"epoch": 2.2868195005404175, |
|
"grad_norm": 1.816628098487854, |
|
"learning_rate": 0.00019045451959724672, |
|
"loss": 1.1295, |
|
"step": 40200 |
|
}, |
|
{ |
|
"epoch": 2.292508106263155, |
|
"grad_norm": 1.5481749773025513, |
|
"learning_rate": 0.00018893755807118342, |
|
"loss": 1.124, |
|
"step": 40300 |
|
}, |
|
{ |
|
"epoch": 2.2981967119858924, |
|
"grad_norm": 1.632873296737671, |
|
"learning_rate": 0.00018742059654512015, |
|
"loss": 1.1217, |
|
"step": 40400 |
|
}, |
|
{ |
|
"epoch": 2.3038853177086294, |
|
"grad_norm": 1.4403363466262817, |
|
"learning_rate": 0.00018590363501905683, |
|
"loss": 1.1315, |
|
"step": 40500 |
|
}, |
|
{ |
|
"epoch": 2.309573923431367, |
|
"grad_norm": 1.6744205951690674, |
|
"learning_rate": 0.00018438667349299353, |
|
"loss": 1.1473, |
|
"step": 40600 |
|
}, |
|
{ |
|
"epoch": 2.3152625291541042, |
|
"grad_norm": 1.5021002292633057, |
|
"learning_rate": 0.00018286971196693026, |
|
"loss": 1.1127, |
|
"step": 40700 |
|
}, |
|
{ |
|
"epoch": 2.3209511348768417, |
|
"grad_norm": 1.689931869506836, |
|
"learning_rate": 0.00018135275044086694, |
|
"loss": 1.1394, |
|
"step": 40800 |
|
}, |
|
{ |
|
"epoch": 2.326639740599579, |
|
"grad_norm": 2.1370577812194824, |
|
"learning_rate": 0.00017983578891480367, |
|
"loss": 1.148, |
|
"step": 40900 |
|
}, |
|
{ |
|
"epoch": 2.3323283463223166, |
|
"grad_norm": 1.9048566818237305, |
|
"learning_rate": 0.00017831882738874038, |
|
"loss": 1.1181, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 2.338016952045054, |
|
"grad_norm": 1.8328748941421509, |
|
"learning_rate": 0.00017680186586267705, |
|
"loss": 1.1302, |
|
"step": 41100 |
|
}, |
|
{ |
|
"epoch": 2.343705557767791, |
|
"grad_norm": 1.7709869146347046, |
|
"learning_rate": 0.0001753000739518744, |
|
"loss": 1.1369, |
|
"step": 41200 |
|
}, |
|
{ |
|
"epoch": 2.3493941634905284, |
|
"grad_norm": 1.6296570301055908, |
|
"learning_rate": 0.00017378311242581112, |
|
"loss": 1.1302, |
|
"step": 41300 |
|
}, |
|
{ |
|
"epoch": 2.355082769213266, |
|
"grad_norm": 1.6044236421585083, |
|
"learning_rate": 0.0001722661508997478, |
|
"loss": 1.1313, |
|
"step": 41400 |
|
}, |
|
{ |
|
"epoch": 2.3607713749360033, |
|
"grad_norm": 1.4571659564971924, |
|
"learning_rate": 0.00017074918937368453, |
|
"loss": 1.1249, |
|
"step": 41500 |
|
}, |
|
{ |
|
"epoch": 2.3664599806587407, |
|
"grad_norm": 1.7237457036972046, |
|
"learning_rate": 0.00016923222784762123, |
|
"loss": 1.1312, |
|
"step": 41600 |
|
}, |
|
{ |
|
"epoch": 2.3721485863814777, |
|
"grad_norm": 1.552881121635437, |
|
"learning_rate": 0.0001677152663215579, |
|
"loss": 1.1282, |
|
"step": 41700 |
|
}, |
|
{ |
|
"epoch": 2.377837192104215, |
|
"grad_norm": 1.6091784238815308, |
|
"learning_rate": 0.00016619830479549464, |
|
"loss": 1.1236, |
|
"step": 41800 |
|
}, |
|
{ |
|
"epoch": 2.3835257978269526, |
|
"grad_norm": 1.8620885610580444, |
|
"learning_rate": 0.00016468134326943134, |
|
"loss": 1.1469, |
|
"step": 41900 |
|
}, |
|
{ |
|
"epoch": 2.38921440354969, |
|
"grad_norm": 1.717551827430725, |
|
"learning_rate": 0.00016316438174336802, |
|
"loss": 1.121, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 2.3949030092724275, |
|
"grad_norm": 1.6212184429168701, |
|
"learning_rate": 0.00016164742021730475, |
|
"loss": 1.0997, |
|
"step": 42100 |
|
}, |
|
{ |
|
"epoch": 2.4005916149951645, |
|
"grad_norm": 1.3878498077392578, |
|
"learning_rate": 0.00016013045869124146, |
|
"loss": 1.1362, |
|
"step": 42200 |
|
}, |
|
{ |
|
"epoch": 2.406280220717902, |
|
"grad_norm": 1.6336196660995483, |
|
"learning_rate": 0.00015861349716517816, |
|
"loss": 1.1256, |
|
"step": 42300 |
|
}, |
|
{ |
|
"epoch": 2.4119688264406394, |
|
"grad_norm": 1.7155201435089111, |
|
"learning_rate": 0.00015709653563911486, |
|
"loss": 1.1133, |
|
"step": 42400 |
|
}, |
|
{ |
|
"epoch": 2.417657432163377, |
|
"grad_norm": 1.7675564289093018, |
|
"learning_rate": 0.00015557957411305157, |
|
"loss": 1.1416, |
|
"step": 42500 |
|
}, |
|
{ |
|
"epoch": 2.4233460378861142, |
|
"grad_norm": 1.676527976989746, |
|
"learning_rate": 0.00015406261258698827, |
|
"loss": 1.1378, |
|
"step": 42600 |
|
}, |
|
{ |
|
"epoch": 2.4290346436088512, |
|
"grad_norm": 1.6293052434921265, |
|
"learning_rate": 0.00015254565106092498, |
|
"loss": 1.1177, |
|
"step": 42700 |
|
}, |
|
{ |
|
"epoch": 2.4347232493315887, |
|
"grad_norm": 1.5264780521392822, |
|
"learning_rate": 0.00015102868953486168, |
|
"loss": 1.1063, |
|
"step": 42800 |
|
}, |
|
{ |
|
"epoch": 2.440411855054326, |
|
"grad_norm": 1.6453486680984497, |
|
"learning_rate": 0.00014951172800879839, |
|
"loss": 1.1375, |
|
"step": 42900 |
|
}, |
|
{ |
|
"epoch": 2.4461004607770636, |
|
"grad_norm": 1.692336082458496, |
|
"learning_rate": 0.0001479947664827351, |
|
"loss": 1.1004, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 2.451789066499801, |
|
"grad_norm": 1.868812084197998, |
|
"learning_rate": 0.0001464778049566718, |
|
"loss": 1.1288, |
|
"step": 43100 |
|
}, |
|
{ |
|
"epoch": 2.4574776722225384, |
|
"grad_norm": 1.7713991403579712, |
|
"learning_rate": 0.0001449608434306085, |
|
"loss": 1.1229, |
|
"step": 43200 |
|
}, |
|
{ |
|
"epoch": 2.4631662779452754, |
|
"grad_norm": 1.6394290924072266, |
|
"learning_rate": 0.00014345905151980583, |
|
"loss": 1.0968, |
|
"step": 43300 |
|
}, |
|
{ |
|
"epoch": 2.468854883668013, |
|
"grad_norm": 1.7240723371505737, |
|
"learning_rate": 0.00014194208999374254, |
|
"loss": 1.1151, |
|
"step": 43400 |
|
}, |
|
{ |
|
"epoch": 2.4745434893907503, |
|
"grad_norm": 1.9284464120864868, |
|
"learning_rate": 0.00014042512846767924, |
|
"loss": 1.1302, |
|
"step": 43500 |
|
}, |
|
{ |
|
"epoch": 2.4802320951134877, |
|
"grad_norm": 1.6855792999267578, |
|
"learning_rate": 0.00013890816694161595, |
|
"loss": 1.1163, |
|
"step": 43600 |
|
}, |
|
{ |
|
"epoch": 2.485920700836225, |
|
"grad_norm": 1.8182587623596191, |
|
"learning_rate": 0.00013739120541555265, |
|
"loss": 1.1172, |
|
"step": 43700 |
|
}, |
|
{ |
|
"epoch": 2.4916093065589626, |
|
"grad_norm": 1.5971157550811768, |
|
"learning_rate": 0.00013587424388948935, |
|
"loss": 1.1071, |
|
"step": 43800 |
|
}, |
|
{ |
|
"epoch": 2.4972979122816996, |
|
"grad_norm": 1.7139756679534912, |
|
"learning_rate": 0.00013435728236342606, |
|
"loss": 1.1239, |
|
"step": 43900 |
|
}, |
|
{ |
|
"epoch": 2.502986518004437, |
|
"grad_norm": 1.7199363708496094, |
|
"learning_rate": 0.00013284032083736276, |
|
"loss": 1.1444, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 2.5086751237271745, |
|
"grad_norm": 1.7295994758605957, |
|
"learning_rate": 0.00013132335931129947, |
|
"loss": 1.122, |
|
"step": 44100 |
|
}, |
|
{ |
|
"epoch": 2.514363729449912, |
|
"grad_norm": 1.9433492422103882, |
|
"learning_rate": 0.00012980639778523617, |
|
"loss": 1.1209, |
|
"step": 44200 |
|
}, |
|
{ |
|
"epoch": 2.5200523351726494, |
|
"grad_norm": 1.5811411142349243, |
|
"learning_rate": 0.0001282894362591729, |
|
"loss": 1.1084, |
|
"step": 44300 |
|
}, |
|
{ |
|
"epoch": 2.5257409408953864, |
|
"grad_norm": 1.5232020616531372, |
|
"learning_rate": 0.00012677247473310958, |
|
"loss": 1.1372, |
|
"step": 44400 |
|
}, |
|
{ |
|
"epoch": 2.531429546618124, |
|
"grad_norm": 2.6212551593780518, |
|
"learning_rate": 0.00012525551320704628, |
|
"loss": 1.1246, |
|
"step": 44500 |
|
}, |
|
{ |
|
"epoch": 2.5371181523408612, |
|
"grad_norm": 1.4962718486785889, |
|
"learning_rate": 0.00012373855168098301, |
|
"loss": 1.1386, |
|
"step": 44600 |
|
}, |
|
{ |
|
"epoch": 2.5428067580635987, |
|
"grad_norm": 1.7713087797164917, |
|
"learning_rate": 0.0001222215901549197, |
|
"loss": 1.1314, |
|
"step": 44700 |
|
}, |
|
{ |
|
"epoch": 2.548495363786336, |
|
"grad_norm": 1.5493218898773193, |
|
"learning_rate": 0.00012070462862885641, |
|
"loss": 1.1204, |
|
"step": 44800 |
|
}, |
|
{ |
|
"epoch": 2.554183969509073, |
|
"grad_norm": 1.6126313209533691, |
|
"learning_rate": 0.00011918766710279313, |
|
"loss": 1.1283, |
|
"step": 44900 |
|
}, |
|
{ |
|
"epoch": 2.5598725752318106, |
|
"grad_norm": 1.5327433347702026, |
|
"learning_rate": 0.00011767070557672982, |
|
"loss": 1.124, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 2.5598725752318106, |
|
"eval_accuracy": 0.725824, |
|
"eval_loss": 1.0916061401367188, |
|
"eval_runtime": 80.2149, |
|
"eval_samples_per_second": 3116.626, |
|
"eval_steps_per_second": 12.18, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 2.565561180954548, |
|
"grad_norm": 1.5026576519012451, |
|
"learning_rate": 0.00011615374405066652, |
|
"loss": 1.1197, |
|
"step": 45100 |
|
}, |
|
{ |
|
"epoch": 2.5712497866772854, |
|
"grad_norm": 1.6989002227783203, |
|
"learning_rate": 0.00011463678252460321, |
|
"loss": 1.1247, |
|
"step": 45200 |
|
}, |
|
{ |
|
"epoch": 2.576938392400023, |
|
"grad_norm": 1.5901920795440674, |
|
"learning_rate": 0.00011313499061380057, |
|
"loss": 1.1352, |
|
"step": 45300 |
|
}, |
|
{ |
|
"epoch": 2.58262699812276, |
|
"grad_norm": 1.4382330179214478, |
|
"learning_rate": 0.00011161802908773727, |
|
"loss": 1.1093, |
|
"step": 45400 |
|
}, |
|
{ |
|
"epoch": 2.5883156038454973, |
|
"grad_norm": 1.8520530462265015, |
|
"learning_rate": 0.00011010106756167397, |
|
"loss": 1.1081, |
|
"step": 45500 |
|
}, |
|
{ |
|
"epoch": 2.5940042095682347, |
|
"grad_norm": 1.8772435188293457, |
|
"learning_rate": 0.00010858410603561066, |
|
"loss": 1.1157, |
|
"step": 45600 |
|
}, |
|
{ |
|
"epoch": 2.599692815290972, |
|
"grad_norm": 1.6013365983963013, |
|
"learning_rate": 0.00010706714450954738, |
|
"loss": 1.1463, |
|
"step": 45700 |
|
}, |
|
{ |
|
"epoch": 2.6053814210137096, |
|
"grad_norm": 1.582515835762024, |
|
"learning_rate": 0.0001055501829834841, |
|
"loss": 1.1135, |
|
"step": 45800 |
|
}, |
|
{ |
|
"epoch": 2.611070026736447, |
|
"grad_norm": 1.3782535791397095, |
|
"learning_rate": 0.00010403322145742079, |
|
"loss": 1.1101, |
|
"step": 45900 |
|
}, |
|
{ |
|
"epoch": 2.6167586324591845, |
|
"grad_norm": 1.465584397315979, |
|
"learning_rate": 0.00010251625993135749, |
|
"loss": 1.1354, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 2.6224472381819215, |
|
"grad_norm": 1.4038536548614502, |
|
"learning_rate": 0.00010099929840529421, |
|
"loss": 1.1078, |
|
"step": 46100 |
|
}, |
|
{ |
|
"epoch": 2.628135843904659, |
|
"grad_norm": 1.9926286935806274, |
|
"learning_rate": 9.948233687923091e-05, |
|
"loss": 1.1044, |
|
"step": 46200 |
|
}, |
|
{ |
|
"epoch": 2.6338244496273964, |
|
"grad_norm": 1.6215740442276, |
|
"learning_rate": 9.796537535316762e-05, |
|
"loss": 1.1188, |
|
"step": 46300 |
|
}, |
|
{ |
|
"epoch": 2.639513055350134, |
|
"grad_norm": 1.5623165369033813, |
|
"learning_rate": 9.644841382710431e-05, |
|
"loss": 1.1155, |
|
"step": 46400 |
|
}, |
|
{ |
|
"epoch": 2.6452016610728712, |
|
"grad_norm": 1.491926670074463, |
|
"learning_rate": 9.493145230104102e-05, |
|
"loss": 1.1211, |
|
"step": 46500 |
|
}, |
|
{ |
|
"epoch": 2.6508902667956082, |
|
"grad_norm": 1.7084381580352783, |
|
"learning_rate": 9.341449077497773e-05, |
|
"loss": 1.1091, |
|
"step": 46600 |
|
}, |
|
{ |
|
"epoch": 2.6565788725183457, |
|
"grad_norm": 1.5060371160507202, |
|
"learning_rate": 9.189752924891443e-05, |
|
"loss": 1.1198, |
|
"step": 46700 |
|
}, |
|
{ |
|
"epoch": 2.662267478241083, |
|
"grad_norm": 1.7321504354476929, |
|
"learning_rate": 9.038056772285112e-05, |
|
"loss": 1.1157, |
|
"step": 46800 |
|
}, |
|
{ |
|
"epoch": 2.6679560839638206, |
|
"grad_norm": 1.559877634048462, |
|
"learning_rate": 8.886360619678784e-05, |
|
"loss": 1.1035, |
|
"step": 46900 |
|
}, |
|
{ |
|
"epoch": 2.673644689686558, |
|
"grad_norm": 1.8588401079177856, |
|
"learning_rate": 8.734664467072455e-05, |
|
"loss": 1.1288, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 2.679333295409295, |
|
"grad_norm": 1.751246452331543, |
|
"learning_rate": 8.582968314466125e-05, |
|
"loss": 1.1206, |
|
"step": 47100 |
|
}, |
|
{ |
|
"epoch": 2.6850219011320324, |
|
"grad_norm": 1.7309458255767822, |
|
"learning_rate": 8.431272161859795e-05, |
|
"loss": 1.1089, |
|
"step": 47200 |
|
}, |
|
{ |
|
"epoch": 2.69071050685477, |
|
"grad_norm": 1.8057925701141357, |
|
"learning_rate": 8.281092970779529e-05, |
|
"loss": 1.1244, |
|
"step": 47300 |
|
}, |
|
{ |
|
"epoch": 2.6963991125775073, |
|
"grad_norm": 1.7594059705734253, |
|
"learning_rate": 8.1293968181732e-05, |
|
"loss": 1.1188, |
|
"step": 47400 |
|
}, |
|
{ |
|
"epoch": 2.7020877183002447, |
|
"grad_norm": 1.686438798904419, |
|
"learning_rate": 7.97770066556687e-05, |
|
"loss": 1.1068, |
|
"step": 47500 |
|
}, |
|
{ |
|
"epoch": 2.7077763240229817, |
|
"grad_norm": 1.6962246894836426, |
|
"learning_rate": 7.82600451296054e-05, |
|
"loss": 1.1043, |
|
"step": 47600 |
|
}, |
|
{ |
|
"epoch": 2.713464929745719, |
|
"grad_norm": 1.5946807861328125, |
|
"learning_rate": 7.67430836035421e-05, |
|
"loss": 1.1109, |
|
"step": 47700 |
|
}, |
|
{ |
|
"epoch": 2.7191535354684566, |
|
"grad_norm": 1.4834094047546387, |
|
"learning_rate": 7.522612207747881e-05, |
|
"loss": 1.114, |
|
"step": 47800 |
|
}, |
|
{ |
|
"epoch": 2.724842141191194, |
|
"grad_norm": 1.763058066368103, |
|
"learning_rate": 7.370916055141553e-05, |
|
"loss": 1.1091, |
|
"step": 47900 |
|
}, |
|
{ |
|
"epoch": 2.7305307469139315, |
|
"grad_norm": 1.9240601062774658, |
|
"learning_rate": 7.219219902535223e-05, |
|
"loss": 1.0936, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 2.7362193526366685, |
|
"grad_norm": 1.4768198728561401, |
|
"learning_rate": 7.067523749928892e-05, |
|
"loss": 1.1158, |
|
"step": 48100 |
|
}, |
|
{ |
|
"epoch": 2.7419079583594064, |
|
"grad_norm": 1.9692409038543701, |
|
"learning_rate": 6.915827597322563e-05, |
|
"loss": 1.1201, |
|
"step": 48200 |
|
}, |
|
{ |
|
"epoch": 2.7475965640821434, |
|
"grad_norm": 1.636785864830017, |
|
"learning_rate": 6.764131444716234e-05, |
|
"loss": 1.1092, |
|
"step": 48300 |
|
}, |
|
{ |
|
"epoch": 2.753285169804881, |
|
"grad_norm": 1.5599926710128784, |
|
"learning_rate": 6.612435292109905e-05, |
|
"loss": 1.0932, |
|
"step": 48400 |
|
}, |
|
{ |
|
"epoch": 2.7589737755276182, |
|
"grad_norm": 1.695862054824829, |
|
"learning_rate": 6.460739139503574e-05, |
|
"loss": 1.1227, |
|
"step": 48500 |
|
}, |
|
{ |
|
"epoch": 2.7646623812503557, |
|
"grad_norm": 1.8806819915771484, |
|
"learning_rate": 6.309042986897246e-05, |
|
"loss": 1.1049, |
|
"step": 48600 |
|
}, |
|
{ |
|
"epoch": 2.770350986973093, |
|
"grad_norm": 1.814792513847351, |
|
"learning_rate": 6.157346834290916e-05, |
|
"loss": 1.1149, |
|
"step": 48700 |
|
}, |
|
{ |
|
"epoch": 2.77603959269583, |
|
"grad_norm": 2.068614959716797, |
|
"learning_rate": 6.005650681684586e-05, |
|
"loss": 1.1181, |
|
"step": 48800 |
|
}, |
|
{ |
|
"epoch": 2.7817281984185676, |
|
"grad_norm": 1.5576444864273071, |
|
"learning_rate": 5.853954529078256e-05, |
|
"loss": 1.1223, |
|
"step": 48900 |
|
}, |
|
{ |
|
"epoch": 2.787416804141305, |
|
"grad_norm": 1.8175384998321533, |
|
"learning_rate": 5.7022583764719273e-05, |
|
"loss": 1.1113, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 2.7931054098640424, |
|
"grad_norm": 1.570915937423706, |
|
"learning_rate": 5.550562223865598e-05, |
|
"loss": 1.1123, |
|
"step": 49100 |
|
}, |
|
{ |
|
"epoch": 2.79879401558678, |
|
"grad_norm": 1.9663364887237549, |
|
"learning_rate": 5.3988660712592675e-05, |
|
"loss": 1.1065, |
|
"step": 49200 |
|
}, |
|
{ |
|
"epoch": 2.804482621309517, |
|
"grad_norm": 2.2906079292297363, |
|
"learning_rate": 5.248686880179001e-05, |
|
"loss": 1.0993, |
|
"step": 49300 |
|
}, |
|
{ |
|
"epoch": 2.8101712270322543, |
|
"grad_norm": 1.566801905632019, |
|
"learning_rate": 5.096990727572673e-05, |
|
"loss": 1.0964, |
|
"step": 49400 |
|
}, |
|
{ |
|
"epoch": 2.8158598327549917, |
|
"grad_norm": 1.7769867181777954, |
|
"learning_rate": 4.9452945749663425e-05, |
|
"loss": 1.0978, |
|
"step": 49500 |
|
}, |
|
{ |
|
"epoch": 2.821548438477729, |
|
"grad_norm": 1.9856287240982056, |
|
"learning_rate": 4.7935984223600136e-05, |
|
"loss": 1.0875, |
|
"step": 49600 |
|
}, |
|
{ |
|
"epoch": 2.8272370442004666, |
|
"grad_norm": 1.7836079597473145, |
|
"learning_rate": 4.6419022697536834e-05, |
|
"loss": 1.1056, |
|
"step": 49700 |
|
}, |
|
{ |
|
"epoch": 2.8329256499232036, |
|
"grad_norm": 1.9246402978897095, |
|
"learning_rate": 4.4902061171473545e-05, |
|
"loss": 1.1074, |
|
"step": 49800 |
|
}, |
|
{ |
|
"epoch": 2.838614255645941, |
|
"grad_norm": 1.3988184928894043, |
|
"learning_rate": 4.338509964541024e-05, |
|
"loss": 1.1206, |
|
"step": 49900 |
|
}, |
|
{ |
|
"epoch": 2.8443028613686785, |
|
"grad_norm": 1.7193849086761475, |
|
"learning_rate": 4.186813811934695e-05, |
|
"loss": 1.1245, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 2.8443028613686785, |
|
"eval_accuracy": 0.729988, |
|
"eval_loss": 1.0782374143600464, |
|
"eval_runtime": 82.4205, |
|
"eval_samples_per_second": 3033.226, |
|
"eval_steps_per_second": 11.854, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 2.849991467091416, |
|
"grad_norm": 1.7059062719345093, |
|
"learning_rate": 4.035117659328366e-05, |
|
"loss": 1.1009, |
|
"step": 50100 |
|
}, |
|
{ |
|
"epoch": 2.8556800728141534, |
|
"grad_norm": 1.4554681777954102, |
|
"learning_rate": 3.883421506722036e-05, |
|
"loss": 1.1108, |
|
"step": 50200 |
|
}, |
|
{ |
|
"epoch": 2.8613686785368904, |
|
"grad_norm": 1.7067590951919556, |
|
"learning_rate": 3.7317253541157065e-05, |
|
"loss": 1.0956, |
|
"step": 50300 |
|
}, |
|
{ |
|
"epoch": 2.867057284259628, |
|
"grad_norm": 1.7176940441131592, |
|
"learning_rate": 3.580029201509377e-05, |
|
"loss": 1.0841, |
|
"step": 50400 |
|
}, |
|
{ |
|
"epoch": 2.8727458899823652, |
|
"grad_norm": 1.7251313924789429, |
|
"learning_rate": 3.4283330489030474e-05, |
|
"loss": 1.0908, |
|
"step": 50500 |
|
}, |
|
{ |
|
"epoch": 2.8784344957051027, |
|
"grad_norm": 1.668372631072998, |
|
"learning_rate": 3.276636896296718e-05, |
|
"loss": 1.0954, |
|
"step": 50600 |
|
}, |
|
{ |
|
"epoch": 2.88412310142784, |
|
"grad_norm": 1.889109492301941, |
|
"learning_rate": 3.124940743690388e-05, |
|
"loss": 1.1096, |
|
"step": 50700 |
|
}, |
|
{ |
|
"epoch": 2.8898117071505776, |
|
"grad_norm": 1.509391188621521, |
|
"learning_rate": 2.973244591084059e-05, |
|
"loss": 1.09, |
|
"step": 50800 |
|
}, |
|
{ |
|
"epoch": 2.895500312873315, |
|
"grad_norm": 2.024489402770996, |
|
"learning_rate": 2.821548438477729e-05, |
|
"loss": 1.0993, |
|
"step": 50900 |
|
}, |
|
{ |
|
"epoch": 2.901188918596052, |
|
"grad_norm": 2.007756471633911, |
|
"learning_rate": 2.6698522858713998e-05, |
|
"loss": 1.1029, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 2.9068775243187894, |
|
"grad_norm": 1.5296841859817505, |
|
"learning_rate": 2.51815613326507e-05, |
|
"loss": 1.095, |
|
"step": 51100 |
|
}, |
|
{ |
|
"epoch": 2.912566130041527, |
|
"grad_norm": 1.6109613180160522, |
|
"learning_rate": 2.3664599806587406e-05, |
|
"loss": 1.1044, |
|
"step": 51200 |
|
}, |
|
{ |
|
"epoch": 2.9182547357642643, |
|
"grad_norm": 1.8067957162857056, |
|
"learning_rate": 2.216280789578474e-05, |
|
"loss": 1.1074, |
|
"step": 51300 |
|
}, |
|
{ |
|
"epoch": 2.9239433414870017, |
|
"grad_norm": 1.6997472047805786, |
|
"learning_rate": 2.064584636972145e-05, |
|
"loss": 1.1036, |
|
"step": 51400 |
|
}, |
|
{ |
|
"epoch": 2.9296319472097387, |
|
"grad_norm": 1.6443182229995728, |
|
"learning_rate": 1.9128884843658153e-05, |
|
"loss": 1.1258, |
|
"step": 51500 |
|
}, |
|
{ |
|
"epoch": 2.935320552932476, |
|
"grad_norm": 1.868161916732788, |
|
"learning_rate": 1.7611923317594857e-05, |
|
"loss": 1.115, |
|
"step": 51600 |
|
}, |
|
{ |
|
"epoch": 2.9410091586552136, |
|
"grad_norm": 1.5620206594467163, |
|
"learning_rate": 1.609496179153156e-05, |
|
"loss": 1.1075, |
|
"step": 51700 |
|
}, |
|
{ |
|
"epoch": 2.946697764377951, |
|
"grad_norm": 1.6326332092285156, |
|
"learning_rate": 1.4578000265468267e-05, |
|
"loss": 1.1113, |
|
"step": 51800 |
|
}, |
|
{ |
|
"epoch": 2.9523863701006885, |
|
"grad_norm": 1.805126428604126, |
|
"learning_rate": 1.3061038739404973e-05, |
|
"loss": 1.0937, |
|
"step": 51900 |
|
}, |
|
{ |
|
"epoch": 2.9580749758234255, |
|
"grad_norm": 1.5693707466125488, |
|
"learning_rate": 1.1544077213341677e-05, |
|
"loss": 1.1053, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 2.963763581546163, |
|
"grad_norm": 1.4851309061050415, |
|
"learning_rate": 1.0027115687278382e-05, |
|
"loss": 1.1101, |
|
"step": 52100 |
|
}, |
|
{ |
|
"epoch": 2.9694521872689004, |
|
"grad_norm": 1.8946778774261475, |
|
"learning_rate": 8.510154161215086e-06, |
|
"loss": 1.0934, |
|
"step": 52200 |
|
}, |
|
{ |
|
"epoch": 2.975140792991638, |
|
"grad_norm": 1.5624499320983887, |
|
"learning_rate": 6.993192635151791e-06, |
|
"loss": 1.0994, |
|
"step": 52300 |
|
}, |
|
{ |
|
"epoch": 2.9808293987143752, |
|
"grad_norm": 1.5662641525268555, |
|
"learning_rate": 5.476231109088497e-06, |
|
"loss": 1.1058, |
|
"step": 52400 |
|
}, |
|
{ |
|
"epoch": 2.9865180044371122, |
|
"grad_norm": 1.514809250831604, |
|
"learning_rate": 3.959269583025201e-06, |
|
"loss": 1.0967, |
|
"step": 52500 |
|
}, |
|
{ |
|
"epoch": 2.9922066101598497, |
|
"grad_norm": 1.8442556858062744, |
|
"learning_rate": 2.4423080569619053e-06, |
|
"loss": 1.1041, |
|
"step": 52600 |
|
}, |
|
{ |
|
"epoch": 2.997895215882587, |
|
"grad_norm": 1.7445664405822754, |
|
"learning_rate": 9.253465308986101e-07, |
|
"loss": 1.0786, |
|
"step": 52700 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"step": 52737, |
|
"total_flos": 1.3116020904e+17, |
|
"train_loss": 1.269093582550002, |
|
"train_runtime": 7252.1306, |
|
"train_samples_per_second": 1861.522, |
|
"train_steps_per_second": 7.272 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 52737, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 5000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.3116020904e+17, |
|
"train_batch_size": 256, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|