{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.9972932091393998,
  "eval_steps": 100,
  "global_step": 784,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.012737839344001274,
      "grad_norm": 7.091875076293945,
      "learning_rate": 3.75e-05,
      "loss": 37.744,
      "step": 5
    },
    {
      "epoch": 0.02547567868800255,
      "grad_norm": 2.930401563644409,
      "learning_rate": 7.5e-05,
      "loss": 34.0864,
      "step": 10
    },
    {
      "epoch": 0.03821351803200382,
      "grad_norm": 1.8477588891983032,
      "learning_rate": 0.0001125,
      "loss": 31.2726,
      "step": 15
    },
    {
      "epoch": 0.0509513573760051,
      "grad_norm": 1.3455390930175781,
      "learning_rate": 0.00015,
      "loss": 28.3763,
      "step": 20
    },
    {
      "epoch": 0.06368919672000636,
      "grad_norm": 1.138717532157898,
      "learning_rate": 0.00018749999999999998,
      "loss": 26.957,
      "step": 25
    },
    {
      "epoch": 0.07642703606400764,
      "grad_norm": 0.9747544527053833,
      "learning_rate": 0.000225,
      "loss": 24.4616,
      "step": 30
    },
    {
      "epoch": 0.08916487540800892,
      "grad_norm": 0.9035225510597229,
      "learning_rate": 0.0002625,
      "loss": 22.5748,
      "step": 35
    },
    {
      "epoch": 0.1019027147520102,
      "grad_norm": 0.7786006927490234,
      "learning_rate": 0.0003,
      "loss": 20.6574,
      "step": 40
    },
    {
      "epoch": 0.11464055409601147,
      "grad_norm": 0.7649045586585999,
      "learning_rate": 0.0003,
      "loss": 18.9346,
      "step": 45
    },
    {
      "epoch": 0.12737839344001273,
      "grad_norm": 0.6415356993675232,
      "learning_rate": 0.0003,
      "loss": 17.8129,
      "step": 50
    },
    {
      "epoch": 0.140116232784014,
      "grad_norm": 0.5701594948768616,
      "learning_rate": 0.0003,
      "loss": 16.881,
      "step": 55
    },
    {
      "epoch": 0.15285407212801527,
      "grad_norm": 0.49638187885284424,
      "learning_rate": 0.0003,
      "loss": 16.2049,
      "step": 60
    },
    {
      "epoch": 0.16559191147201657,
      "grad_norm": 0.44346606731414795,
      "learning_rate": 0.0003,
      "loss": 15.9336,
      "step": 65
    },
    {
      "epoch": 0.17832975081601785,
      "grad_norm": 0.4194740355014801,
      "learning_rate": 0.0003,
      "loss": 15.2473,
      "step": 70
    },
    {
      "epoch": 0.19106759016001912,
      "grad_norm": 0.4130041301250458,
      "learning_rate": 0.0003,
      "loss": 15.1218,
      "step": 75
    },
    {
      "epoch": 0.2038054295040204,
      "grad_norm": 0.40480196475982666,
      "learning_rate": 0.0003,
      "loss": 14.7839,
      "step": 80
    },
    {
      "epoch": 0.21654326884802166,
      "grad_norm": 0.394378662109375,
      "learning_rate": 0.0003,
      "loss": 14.2312,
      "step": 85
    },
    {
      "epoch": 0.22928110819202294,
      "grad_norm": 0.39825204014778137,
      "learning_rate": 0.0003,
      "loss": 13.9441,
      "step": 90
    },
    {
      "epoch": 0.2420189475360242,
      "grad_norm": 0.38816991448402405,
      "learning_rate": 0.0003,
      "loss": 13.4799,
      "step": 95
    },
    {
      "epoch": 0.25475678688002545,
      "grad_norm": 0.36586159467697144,
      "learning_rate": 0.0003,
      "loss": 13.3276,
      "step": 100
    },
    {
      "epoch": 0.25475678688002545,
      "eval_accuracy": 0.013133919843597262,
      "eval_loss": 12.040165901184082,
      "eval_runtime": 14.4617,
      "eval_samples_per_second": 17.287,
      "eval_steps_per_second": 4.356,
      "step": 100
    },
    {
      "epoch": 0.26749462622402675,
      "grad_norm": 0.40571218729019165,
      "learning_rate": 0.0003,
      "loss": 13.1015,
      "step": 105
    },
    {
      "epoch": 0.280232465568028,
      "grad_norm": 0.3502795696258545,
      "learning_rate": 0.0003,
      "loss": 12.614,
      "step": 110
    },
    {
      "epoch": 0.2929703049120293,
      "grad_norm": 0.33776018023490906,
      "learning_rate": 0.0003,
      "loss": 12.488,
      "step": 115
    },
    {
      "epoch": 0.30570814425603055,
      "grad_norm": 0.3277961015701294,
      "learning_rate": 0.0003,
      "loss": 12.2282,
      "step": 120
    },
    {
      "epoch": 0.31844598360003185,
      "grad_norm": 0.3399854898452759,
      "learning_rate": 0.0003,
      "loss": 12.0168,
      "step": 125
    },
    {
      "epoch": 0.33118382294403315,
      "grad_norm": 0.31557145714759827,
      "learning_rate": 0.0003,
      "loss": 11.832,
      "step": 130
    },
    {
      "epoch": 0.3439216622880344,
      "grad_norm": 0.32902857661247253,
      "learning_rate": 0.0003,
      "loss": 11.4818,
      "step": 135
    },
    {
      "epoch": 0.3566595016320357,
      "grad_norm": 0.34518980979919434,
      "learning_rate": 0.0003,
      "loss": 11.3197,
      "step": 140
    },
    {
      "epoch": 0.36939734097603694,
      "grad_norm": 0.32530176639556885,
      "learning_rate": 0.0003,
      "loss": 11.0346,
      "step": 145
    },
    {
      "epoch": 0.38213518032003824,
      "grad_norm": 0.3253624141216278,
      "learning_rate": 0.0003,
      "loss": 10.6717,
      "step": 150
    },
    {
      "epoch": 0.3948730196640395,
      "grad_norm": 0.33527347445487976,
      "learning_rate": 0.0003,
      "loss": 10.5302,
      "step": 155
    },
    {
      "epoch": 0.4076108590080408,
      "grad_norm": 0.3164774477481842,
      "learning_rate": 0.0003,
      "loss": 10.2009,
      "step": 160
    },
    {
      "epoch": 0.420348698352042,
      "grad_norm": 0.3047502934932709,
      "learning_rate": 0.0003,
      "loss": 10.1689,
      "step": 165
    },
    {
      "epoch": 0.4330865376960433,
      "grad_norm": 0.31613191962242126,
      "learning_rate": 0.0003,
      "loss": 9.85,
      "step": 170
    },
    {
      "epoch": 0.4458243770400446,
      "grad_norm": 0.3114412724971771,
      "learning_rate": 0.0003,
      "loss": 9.6662,
      "step": 175
    },
    {
      "epoch": 0.4585622163840459,
      "grad_norm": 0.31863468885421753,
      "learning_rate": 0.0003,
      "loss": 9.4857,
      "step": 180
    },
    {
      "epoch": 0.4713000557280471,
      "grad_norm": 0.3024883568286896,
      "learning_rate": 0.0003,
      "loss": 9.2409,
      "step": 185
    },
    {
      "epoch": 0.4840378950720484,
      "grad_norm": 0.3118532598018646,
      "learning_rate": 0.0003,
      "loss": 9.156,
      "step": 190
    },
    {
      "epoch": 0.49677573441604966,
      "grad_norm": 0.3026701807975769,
      "learning_rate": 0.0003,
      "loss": 9.0273,
      "step": 195
    },
    {
      "epoch": 0.5095135737600509,
      "grad_norm": 0.3058376908302307,
      "learning_rate": 0.0003,
      "loss": 8.9207,
      "step": 200
    },
    {
      "epoch": 0.5095135737600509,
      "eval_accuracy": 0.03601173020527859,
      "eval_loss": 8.031224250793457,
      "eval_runtime": 14.6886,
      "eval_samples_per_second": 17.02,
      "eval_steps_per_second": 4.289,
      "step": 200
    },
    {
      "epoch": 0.5222514131040522,
      "grad_norm": 0.31776145100593567,
      "learning_rate": 0.0003,
      "loss": 8.819,
      "step": 205
    },
    {
      "epoch": 0.5349892524480535,
      "grad_norm": 0.3050650656223297,
      "learning_rate": 0.0003,
      "loss": 8.7563,
      "step": 210
    },
    {
      "epoch": 0.5477270917920548,
      "grad_norm": 0.31346216797828674,
      "learning_rate": 0.0003,
      "loss": 8.4781,
      "step": 215
    },
    {
      "epoch": 0.560464931136056,
      "grad_norm": 0.3162192404270172,
      "learning_rate": 0.0003,
      "loss": 8.49,
      "step": 220
    },
    {
      "epoch": 0.5732027704800573,
      "grad_norm": 0.2908290922641754,
      "learning_rate": 0.0003,
      "loss": 8.1487,
      "step": 225
    },
    {
      "epoch": 0.5859406098240586,
      "grad_norm": 0.29553738236427307,
      "learning_rate": 0.0003,
      "loss": 8.2668,
      "step": 230
    },
    {
      "epoch": 0.5986784491680599,
      "grad_norm": 0.288335919380188,
      "learning_rate": 0.0003,
      "loss": 8.1061,
      "step": 235
    },
    {
      "epoch": 0.6114162885120611,
      "grad_norm": 0.30966615676879883,
      "learning_rate": 0.0003,
      "loss": 8.1297,
      "step": 240
    },
    {
      "epoch": 0.6241541278560624,
      "grad_norm": 0.29941117763519287,
      "learning_rate": 0.0003,
      "loss": 7.8082,
      "step": 245
    },
    {
      "epoch": 0.6368919672000637,
      "grad_norm": 0.29136765003204346,
      "learning_rate": 0.0003,
      "loss": 7.937,
      "step": 250
    },
    {
      "epoch": 0.649629806544065,
      "grad_norm": 0.30150941014289856,
      "learning_rate": 0.0003,
      "loss": 7.7454,
      "step": 255
    },
    {
      "epoch": 0.6623676458880663,
      "grad_norm": 0.28709036111831665,
      "learning_rate": 0.0003,
      "loss": 7.8069,
      "step": 260
    },
    {
      "epoch": 0.6751054852320675,
      "grad_norm": 0.31939393281936646,
      "learning_rate": 0.0003,
      "loss": 7.631,
      "step": 265
    },
    {
      "epoch": 0.6878433245760688,
      "grad_norm": 0.29692211747169495,
      "learning_rate": 0.0003,
      "loss": 7.6632,
      "step": 270
    },
    {
      "epoch": 0.7005811639200701,
      "grad_norm": 0.3304164409637451,
      "learning_rate": 0.0003,
      "loss": 7.4727,
      "step": 275
    },
    {
      "epoch": 0.7133190032640714,
      "grad_norm": 0.28332462906837463,
      "learning_rate": 0.0003,
      "loss": 7.4796,
      "step": 280
    },
    {
      "epoch": 0.7260568426080726,
      "grad_norm": 0.2897827625274658,
      "learning_rate": 0.0003,
      "loss": 7.5389,
      "step": 285
    },
    {
      "epoch": 0.7387946819520739,
      "grad_norm": 0.2887686491012573,
      "learning_rate": 0.0003,
      "loss": 7.382,
      "step": 290
    },
    {
      "epoch": 0.7515325212960752,
      "grad_norm": 0.3093564212322235,
      "learning_rate": 0.0003,
      "loss": 7.2586,
      "step": 295
    },
    {
      "epoch": 0.7642703606400765,
      "grad_norm": 0.2902717590332031,
      "learning_rate": 0.0003,
      "loss": 7.2681,
      "step": 300
    },
    {
      "epoch": 0.7642703606400765,
      "eval_accuracy": 0.050643206256109484,
      "eval_loss": 6.477533340454102,
      "eval_runtime": 14.6327,
      "eval_samples_per_second": 17.085,
      "eval_steps_per_second": 4.305,
      "step": 300
    },
    {
      "epoch": 0.7770081999840777,
      "grad_norm": 0.2867899239063263,
      "learning_rate": 0.0003,
      "loss": 7.0712,
      "step": 305
    },
    {
      "epoch": 0.789746039328079,
      "grad_norm": 0.27321040630340576,
      "learning_rate": 0.0003,
      "loss": 7.0524,
      "step": 310
    },
    {
      "epoch": 0.8024838786720803,
      "grad_norm": 0.3487064242362976,
      "learning_rate": 0.0003,
      "loss": 7.0939,
      "step": 315
    },
    {
      "epoch": 0.8152217180160816,
      "grad_norm": 0.329608291387558,
      "learning_rate": 0.0003,
      "loss": 6.9997,
      "step": 320
    },
    {
      "epoch": 0.8279595573600828,
      "grad_norm": 0.3154338300228119,
      "learning_rate": 0.0003,
      "loss": 6.9663,
      "step": 325
    },
    {
      "epoch": 0.840697396704084,
      "grad_norm": 0.31021803617477417,
      "learning_rate": 0.0003,
      "loss": 6.7821,
      "step": 330
    },
    {
      "epoch": 0.8534352360480854,
      "grad_norm": 0.388336181640625,
      "learning_rate": 0.0003,
      "loss": 6.7751,
      "step": 335
    },
    {
      "epoch": 0.8661730753920867,
      "grad_norm": 0.31887954473495483,
      "learning_rate": 0.0003,
      "loss": 6.702,
      "step": 340
    },
    {
      "epoch": 0.8789109147360878,
      "grad_norm": 0.31558957695961,
      "learning_rate": 0.0003,
      "loss": 6.6206,
      "step": 345
    },
    {
      "epoch": 0.8916487540800891,
      "grad_norm": 0.30751529335975647,
      "learning_rate": 0.0003,
      "loss": 6.7077,
      "step": 350
    },
    {
      "epoch": 0.9043865934240904,
      "grad_norm": 0.33058232069015503,
      "learning_rate": 0.0003,
      "loss": 6.557,
      "step": 355
    },
    {
      "epoch": 0.9171244327680917,
      "grad_norm": 0.3375111222267151,
      "learning_rate": 0.0003,
      "loss": 6.6369,
      "step": 360
    },
    {
      "epoch": 0.9298622721120929,
      "grad_norm": 0.3047392964363098,
      "learning_rate": 0.0003,
      "loss": 6.5796,
      "step": 365
    },
    {
      "epoch": 0.9426001114560942,
      "grad_norm": 0.430053174495697,
      "learning_rate": 0.0003,
      "loss": 6.5548,
      "step": 370
    },
    {
      "epoch": 0.9553379508000955,
      "grad_norm": 0.3610515296459198,
      "learning_rate": 0.0003,
      "loss": 6.4576,
      "step": 375
    },
    {
      "epoch": 0.9680757901440968,
      "grad_norm": 0.32095110416412354,
      "learning_rate": 0.0003,
      "loss": 6.4266,
      "step": 380
    },
    {
      "epoch": 0.980813629488098,
      "grad_norm": 0.32170969247817993,
      "learning_rate": 0.0003,
      "loss": 6.5597,
      "step": 385
    },
    {
      "epoch": 0.9935514688320993,
      "grad_norm": 0.29942792654037476,
      "learning_rate": 0.0003,
      "loss": 6.3873,
      "step": 390
    },
    {
      "epoch": 1.0062893081761006,
      "grad_norm": 0.2971299886703491,
      "learning_rate": 0.0003,
      "loss": 6.3915,
      "step": 395
    },
    {
      "epoch": 1.0190271475201018,
      "grad_norm": 0.2800815999507904,
      "learning_rate": 0.0003,
      "loss": 6.3187,
      "step": 400
    },
    {
      "epoch": 1.0190271475201018,
      "eval_accuracy": 0.0433822091886608,
      "eval_loss": 5.622740268707275,
      "eval_runtime": 14.4103,
      "eval_samples_per_second": 17.349,
      "eval_steps_per_second": 4.372,
      "step": 400
    },
    {
      "epoch": 1.0317649868641032,
      "grad_norm": 0.28819501399993896,
      "learning_rate": 0.0003,
      "loss": 6.328,
      "step": 405
    },
    {
      "epoch": 1.0445028262081044,
      "grad_norm": 0.3983236849308014,
      "learning_rate": 0.0003,
      "loss": 6.3988,
      "step": 410
    },
    {
      "epoch": 1.0572406655521058,
      "grad_norm": 0.2969406545162201,
      "learning_rate": 0.0003,
      "loss": 6.2509,
      "step": 415
    },
    {
      "epoch": 1.069978504896107,
      "grad_norm": 0.2973212003707886,
      "learning_rate": 0.0003,
      "loss": 6.1234,
      "step": 420
    },
    {
      "epoch": 1.0827163442401082,
      "grad_norm": 0.3298945426940918,
      "learning_rate": 0.0003,
      "loss": 6.3219,
      "step": 425
    },
    {
      "epoch": 1.0954541835841096,
      "grad_norm": 0.3493943214416504,
      "learning_rate": 0.0003,
      "loss": 6.0888,
      "step": 430
    },
    {
      "epoch": 1.1081920229281108,
      "grad_norm": 0.3639209270477295,
      "learning_rate": 0.0003,
      "loss": 6.2226,
      "step": 435
    },
    {
      "epoch": 1.120929862272112,
      "grad_norm": 0.43913957476615906,
      "learning_rate": 0.0003,
      "loss": 6.0308,
      "step": 440
    },
    {
      "epoch": 1.1336677016161134,
      "grad_norm": 0.43267834186553955,
      "learning_rate": 0.0003,
      "loss": 6.0806,
      "step": 445
    },
    {
      "epoch": 1.1464055409601146,
      "grad_norm": 0.4563148021697998,
      "learning_rate": 0.0003,
      "loss": 5.9703,
      "step": 450
    },
    {
      "epoch": 1.159143380304116,
      "grad_norm": 0.4002761244773865,
      "learning_rate": 0.0003,
      "loss": 5.9163,
      "step": 455
    },
    {
      "epoch": 1.1718812196481172,
      "grad_norm": 0.4359826147556305,
      "learning_rate": 0.0003,
      "loss": 5.8285,
      "step": 460
    },
    {
      "epoch": 1.1846190589921184,
      "grad_norm": 0.5450247526168823,
      "learning_rate": 0.0003,
      "loss": 5.8063,
      "step": 465
    },
    {
      "epoch": 1.1973568983361198,
      "grad_norm": 0.3597274422645569,
      "learning_rate": 0.0003,
      "loss": 5.6978,
      "step": 470
    },
    {
      "epoch": 1.210094737680121,
      "grad_norm": 0.4141215980052948,
      "learning_rate": 0.0003,
      "loss": 5.6078,
      "step": 475
    },
    {
      "epoch": 1.2228325770241222,
      "grad_norm": 0.3695543110370636,
      "learning_rate": 0.0003,
      "loss": 5.6728,
      "step": 480
    },
    {
      "epoch": 1.2355704163681236,
      "grad_norm": 0.5060051083564758,
      "learning_rate": 0.0003,
      "loss": 5.6049,
      "step": 485
    },
    {
      "epoch": 1.2483082557121248,
      "grad_norm": 0.5355808138847351,
      "learning_rate": 0.0003,
      "loss": 5.6564,
      "step": 490
    },
    {
      "epoch": 1.261046095056126,
      "grad_norm": 0.4578459858894348,
      "learning_rate": 0.0003,
      "loss": 5.5758,
      "step": 495
    },
    {
      "epoch": 1.2737839344001274,
      "grad_norm": 0.4868403673171997,
      "learning_rate": 0.0003,
      "loss": 5.5695,
      "step": 500
    },
    {
      "epoch": 1.2737839344001274,
      "eval_accuracy": 0.36348778103616813,
      "eval_loss": 4.77961540222168,
      "eval_runtime": 14.5581,
      "eval_samples_per_second": 17.173,
      "eval_steps_per_second": 4.328,
      "step": 500
    },
    {
      "epoch": 1.2865217737441286,
      "grad_norm": 0.550255298614502,
      "learning_rate": 0.0003,
      "loss": 5.5591,
      "step": 505
    },
    {
      "epoch": 1.29925961308813,
      "grad_norm": 0.5515110492706299,
      "learning_rate": 0.0003,
      "loss": 5.4588,
      "step": 510
    },
    {
      "epoch": 1.3119974524321312,
      "grad_norm": 0.44656914472579956,
      "learning_rate": 0.0003,
      "loss": 5.4336,
      "step": 515
    },
    {
      "epoch": 1.3247352917761326,
      "grad_norm": 0.5925999283790588,
      "learning_rate": 0.0003,
      "loss": 5.5185,
      "step": 520
    },
    {
      "epoch": 1.3374731311201338,
      "grad_norm": 0.632453203201294,
      "learning_rate": 0.0003,
      "loss": 5.325,
      "step": 525
    },
    {
      "epoch": 1.350210970464135,
      "grad_norm": 0.5380024909973145,
      "learning_rate": 0.0003,
      "loss": 5.4005,
      "step": 530
    },
    {
      "epoch": 1.3629488098081364,
      "grad_norm": 0.5659191012382507,
      "learning_rate": 0.0003,
      "loss": 5.3564,
      "step": 535
    },
    {
      "epoch": 1.3756866491521376,
      "grad_norm": 0.8913821578025818,
      "learning_rate": 0.0003,
      "loss": 5.2763,
      "step": 540
    },
    {
      "epoch": 1.3884244884961388,
      "grad_norm": 0.9271002411842346,
      "learning_rate": 0.0003,
      "loss": 5.4129,
      "step": 545
    },
    {
      "epoch": 1.4011623278401402,
      "grad_norm": 0.7141408324241638,
      "learning_rate": 0.0003,
      "loss": 5.4437,
      "step": 550
    },
    {
      "epoch": 1.4139001671841414,
      "grad_norm": 0.5360827445983887,
      "learning_rate": 0.0003,
      "loss": 5.3523,
      "step": 555
    },
    {
      "epoch": 1.4266380065281425,
      "grad_norm": 0.6563194990158081,
      "learning_rate": 0.0003,
      "loss": 5.1103,
      "step": 560
    },
    {
      "epoch": 1.439375845872144,
      "grad_norm": 0.6325790882110596,
      "learning_rate": 0.0003,
      "loss": 5.4026,
      "step": 565
    },
    {
      "epoch": 1.4521136852161451,
      "grad_norm": 0.8463213443756104,
      "learning_rate": 0.0003,
      "loss": 5.3129,
      "step": 570
    },
    {
      "epoch": 1.4648515245601466,
      "grad_norm": 0.8394812345504761,
      "learning_rate": 0.0003,
      "loss": 5.3415,
      "step": 575
    },
    {
      "epoch": 1.4775893639041477,
      "grad_norm": 0.692244291305542,
      "learning_rate": 0.0003,
      "loss": 5.2649,
      "step": 580
    },
    {
      "epoch": 1.4903272032481492,
      "grad_norm": 0.6197806000709534,
      "learning_rate": 0.0003,
      "loss": 5.112,
      "step": 585
    },
    {
      "epoch": 1.5030650425921503,
      "grad_norm": 0.6573797464370728,
      "learning_rate": 0.0003,
      "loss": 5.1669,
      "step": 590
    },
    {
      "epoch": 1.5158028819361515,
      "grad_norm": 0.795892059803009,
      "learning_rate": 0.0003,
      "loss": 5.1693,
      "step": 595
    },
    {
      "epoch": 1.528540721280153,
      "grad_norm": 0.6279253363609314,
      "learning_rate": 0.0003,
      "loss": 5.2926,
      "step": 600
    },
    {
      "epoch": 1.528540721280153,
      "eval_accuracy": 0.3952492668621701,
      "eval_loss": 4.392324447631836,
      "eval_runtime": 14.409,
      "eval_samples_per_second": 17.35,
      "eval_steps_per_second": 4.372,
      "step": 600
    },
    {
      "epoch": 1.5412785606241541,
      "grad_norm": 0.5762287378311157,
      "learning_rate": 0.0003,
      "loss": 5.0475,
      "step": 605
    },
    {
      "epoch": 1.5540163999681553,
      "grad_norm": 0.5149503350257874,
      "learning_rate": 0.0003,
      "loss": 5.1185,
      "step": 610
    },
    {
      "epoch": 1.5667542393121567,
      "grad_norm": 0.581633985042572,
      "learning_rate": 0.0003,
      "loss": 5.1166,
      "step": 615
    },
    {
      "epoch": 1.579492078656158,
      "grad_norm": 0.5910624861717224,
      "learning_rate": 0.0003,
      "loss": 4.9907,
      "step": 620
    },
    {
      "epoch": 1.5922299180001591,
      "grad_norm": 0.8280585408210754,
      "learning_rate": 0.0003,
      "loss": 5.0748,
      "step": 625
    },
    {
      "epoch": 1.6049677573441605,
      "grad_norm": 0.5128599405288696,
      "learning_rate": 0.0003,
      "loss": 4.9768,
      "step": 630
    },
    {
      "epoch": 1.6177055966881617,
      "grad_norm": 0.7540919184684753,
      "learning_rate": 0.0003,
      "loss": 5.0806,
      "step": 635
    },
    {
      "epoch": 1.630443436032163,
      "grad_norm": 0.6239334940910339,
      "learning_rate": 0.0003,
      "loss": 5.1277,
      "step": 640
    },
    {
      "epoch": 1.6431812753761643,
      "grad_norm": 0.7787991166114807,
      "learning_rate": 0.0003,
      "loss": 5.0778,
      "step": 645
    },
    {
      "epoch": 1.6559191147201657,
      "grad_norm": 0.6328299641609192,
      "learning_rate": 0.0003,
      "loss": 4.9763,
      "step": 650
    },
    {
      "epoch": 1.668656954064167,
      "grad_norm": 0.5455794334411621,
      "learning_rate": 0.0003,
      "loss": 5.0049,
      "step": 655
    },
    {
      "epoch": 1.681394793408168,
      "grad_norm": 0.7078703045845032,
      "learning_rate": 0.0003,
      "loss": 5.0258,
      "step": 660
    },
    {
      "epoch": 1.6941326327521695,
      "grad_norm": 0.6339858770370483,
      "learning_rate": 0.0003,
      "loss": 5.1028,
      "step": 665
    },
    {
      "epoch": 1.7068704720961707,
      "grad_norm": 0.6060242652893066,
      "learning_rate": 0.0003,
      "loss": 5.0428,
      "step": 670
    },
    {
      "epoch": 1.719608311440172,
      "grad_norm": 0.9218889474868774,
      "learning_rate": 0.0003,
      "loss": 4.9891,
      "step": 675
    },
    {
      "epoch": 1.7323461507841733,
      "grad_norm": 0.6890697479248047,
      "learning_rate": 0.0003,
      "loss": 4.8921,
      "step": 680
    },
    {
      "epoch": 1.7450839901281745,
      "grad_norm": 0.9093934297561646,
      "learning_rate": 0.0003,
      "loss": 4.9385,
      "step": 685
    },
    {
      "epoch": 1.7578218294721757,
      "grad_norm": 0.5929202437400818,
      "learning_rate": 0.0003,
      "loss": 4.9376,
      "step": 690
    },
    {
      "epoch": 1.770559668816177,
      "grad_norm": 0.6317362189292908,
      "learning_rate": 0.0003,
      "loss": 4.9681,
      "step": 695
    },
    {
      "epoch": 1.7832975081601783,
      "grad_norm": 0.5537763237953186,
      "learning_rate": 0.0003,
      "loss": 4.878,
      "step": 700
    },
    {
      "epoch": 1.7832975081601783,
      "eval_accuracy": 0.40849266862170086,
      "eval_loss": 4.187656402587891,
      "eval_runtime": 14.4745,
      "eval_samples_per_second": 17.272,
      "eval_steps_per_second": 4.352,
      "step": 700
    },
    {
      "epoch": 1.7960353475041795,
      "grad_norm": 0.5984592437744141,
      "learning_rate": 0.0003,
      "loss": 4.9092,
      "step": 705
    },
    {
      "epoch": 1.808773186848181,
      "grad_norm": 0.5060558915138245,
      "learning_rate": 0.0003,
      "loss": 4.989,
      "step": 710
    },
    {
      "epoch": 1.8215110261921823,
      "grad_norm": 0.8713288903236389,
      "learning_rate": 0.0003,
      "loss": 4.8114,
      "step": 715
    },
    {
      "epoch": 1.8342488655361833,
      "grad_norm": 0.8011664748191833,
      "learning_rate": 0.0003,
      "loss": 4.8468,
      "step": 720
    },
    {
      "epoch": 1.8469867048801847,
      "grad_norm": 0.6774628758430481,
      "learning_rate": 0.0003,
      "loss": 4.8899,
      "step": 725
    },
    {
      "epoch": 1.859724544224186,
      "grad_norm": 1.05668044090271,
      "learning_rate": 0.0003,
      "loss": 4.8676,
      "step": 730
    },
    {
      "epoch": 1.8724623835681873,
      "grad_norm": 0.8638430237770081,
      "learning_rate": 0.0003,
      "loss": 4.8515,
      "step": 735
    },
    {
      "epoch": 1.8852002229121885,
      "grad_norm": 0.8210180997848511,
      "learning_rate": 0.0003,
      "loss": 4.9094,
      "step": 740
    },
    {
      "epoch": 1.8979380622561899,
      "grad_norm": 0.6894564032554626,
      "learning_rate": 0.0003,
      "loss": 4.8406,
      "step": 745
    },
    {
      "epoch": 1.910675901600191,
      "grad_norm": 0.7244303822517395,
      "learning_rate": 0.0003,
      "loss": 4.8299,
      "step": 750
    },
    {
      "epoch": 1.9234137409441923,
      "grad_norm": 0.5788025856018066,
      "learning_rate": 0.0003,
      "loss": 4.8843,
      "step": 755
    },
    {
      "epoch": 1.9361515802881937,
      "grad_norm": 0.5082942843437195,
      "learning_rate": 0.0003,
      "loss": 4.7624,
      "step": 760
    },
    {
      "epoch": 1.9488894196321949,
      "grad_norm": 0.6290297508239746,
      "learning_rate": 0.0003,
      "loss": 4.7709,
      "step": 765
    },
    {
      "epoch": 1.961627258976196,
      "grad_norm": 0.5582670569419861,
      "learning_rate": 0.0003,
      "loss": 4.7169,
      "step": 770
    },
    {
      "epoch": 1.9743650983201975,
      "grad_norm": 0.6051950454711914,
      "learning_rate": 0.0003,
      "loss": 4.7701,
      "step": 775
    },
    {
      "epoch": 1.9871029376641989,
      "grad_norm": 0.6427810788154602,
      "learning_rate": 0.0003,
      "loss": 4.7729,
      "step": 780
    },
    {
      "epoch": 1.9972932091393998,
      "step": 784,
      "total_flos": 6.247688798679859e+16,
      "train_loss": 8.457253451250038,
      "train_runtime": 65851.1603,
      "train_samples_per_second": 1.526,
      "train_steps_per_second": 0.012
    }
  ],
  "logging_steps": 5,
  "max_steps": 784,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 100,
  "total_flos": 6.247688798679859e+16,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}