|
{ |
|
"best_metric": 1.9750508069992065, |
|
"best_model_checkpoint": "./gemma-python/checkpoint-40", |
|
"epoch": 10.0, |
|
"eval_steps": 2, |
|
"global_step": 80, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 40.636978402335416, |
|
"learning_rate": 0.0001, |
|
"loss": 19.0016, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"eval_loss": 18.6992130279541, |
|
"eval_runtime": 2.881, |
|
"eval_samples_per_second": 7.289, |
|
"eval_steps_per_second": 1.041, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 41.61053527062362, |
|
"learning_rate": 0.0002, |
|
"loss": 19.4686, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"eval_loss": 16.257802963256836, |
|
"eval_runtime": 2.9111, |
|
"eval_samples_per_second": 7.214, |
|
"eval_steps_per_second": 1.031, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 28.704819713850974, |
|
"learning_rate": 0.00019991889981715698, |
|
"loss": 13.2303, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 26.40444243073739, |
|
"learning_rate": 0.00019967573081342103, |
|
"loss": 11.468, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"eval_loss": 8.28911018371582, |
|
"eval_runtime": 2.9257, |
|
"eval_samples_per_second": 7.178, |
|
"eval_steps_per_second": 1.025, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 12.912981323843146, |
|
"learning_rate": 0.0001992708874098054, |
|
"loss": 9.3107, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 7.943058500648636, |
|
"learning_rate": 0.00019870502626379127, |
|
"loss": 7.5305, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"eval_loss": 5.884701728820801, |
|
"eval_runtime": 2.9479, |
|
"eval_samples_per_second": 7.124, |
|
"eval_steps_per_second": 1.018, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 6.267657551985817, |
|
"learning_rate": 0.00019797906520422677, |
|
"loss": 6.6492, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 5.0825555341832365, |
|
"learning_rate": 0.0001970941817426052, |
|
"loss": 5.7572, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 4.363473892211914, |
|
"eval_runtime": 2.9653, |
|
"eval_samples_per_second": 7.082, |
|
"eval_steps_per_second": 1.012, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 4.88565620317727, |
|
"learning_rate": 0.00019605181116313724, |
|
"loss": 4.5414, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 5.0847008955317605, |
|
"learning_rate": 0.00019485364419471454, |
|
"loss": 4.3903, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"eval_loss": 3.284867763519287, |
|
"eval_runtime": 2.9746, |
|
"eval_samples_per_second": 7.06, |
|
"eval_steps_per_second": 1.009, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 1.38, |
|
"grad_norm": 3.424587898800574, |
|
"learning_rate": 0.0001935016242685415, |
|
"loss": 3.79, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 2.7255824385278506, |
|
"learning_rate": 0.00019199794436588243, |
|
"loss": 2.9497, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"eval_loss": 2.853942394256592, |
|
"eval_runtime": 2.9866, |
|
"eval_samples_per_second": 7.031, |
|
"eval_steps_per_second": 1.004, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 1.62, |
|
"grad_norm": 2.1001906898750624, |
|
"learning_rate": 0.00019034504346103823, |
|
"loss": 2.7728, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 1.9200021565941778, |
|
"learning_rate": 0.000188545602565321, |
|
"loss": 2.8738, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"eval_loss": 2.62028431892395, |
|
"eval_runtime": 2.9982, |
|
"eval_samples_per_second": 7.004, |
|
"eval_steps_per_second": 1.001, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"grad_norm": 1.8837224890225774, |
|
"learning_rate": 0.00018660254037844388, |
|
"loss": 3.0787, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 1.8929687978608318, |
|
"learning_rate": 0.0001845190085543795, |
|
"loss": 2.7298, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 2.453444242477417, |
|
"eval_runtime": 2.9964, |
|
"eval_samples_per_second": 7.008, |
|
"eval_steps_per_second": 1.001, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 2.12, |
|
"grad_norm": 1.3652069569291694, |
|
"learning_rate": 0.00018229838658936564, |
|
"loss": 2.5967, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 2.25, |
|
"grad_norm": 2.4263600812149417, |
|
"learning_rate": 0.00017994427634035015, |
|
"loss": 2.4284, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 2.25, |
|
"eval_loss": 2.307706832885742, |
|
"eval_runtime": 2.9963, |
|
"eval_samples_per_second": 7.009, |
|
"eval_steps_per_second": 1.001, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 2.38, |
|
"grad_norm": 2.5673391658400053, |
|
"learning_rate": 0.00017746049618276545, |
|
"loss": 2.6721, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 2.2252437500899656, |
|
"learning_rate": 0.00017485107481711012, |
|
"loss": 2.394, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"eval_loss": 2.187636137008667, |
|
"eval_runtime": 2.9975, |
|
"eval_samples_per_second": 7.006, |
|
"eval_steps_per_second": 1.001, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 2.62, |
|
"grad_norm": 2.345233295279928, |
|
"learning_rate": 0.00017212024473438147, |
|
"loss": 2.3972, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 2.75, |
|
"grad_norm": 1.1122620317353238, |
|
"learning_rate": 0.00016927243535095997, |
|
"loss": 2.069, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 2.75, |
|
"eval_loss": 2.1294100284576416, |
|
"eval_runtime": 2.993, |
|
"eval_samples_per_second": 7.016, |
|
"eval_steps_per_second": 1.002, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 2.88, |
|
"grad_norm": 2.8270209249093803, |
|
"learning_rate": 0.00016631226582407952, |
|
"loss": 2.211, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 7.323169716541166, |
|
"learning_rate": 0.00016324453755953773, |
|
"loss": 1.9355, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_loss": 2.1047682762145996, |
|
"eval_runtime": 2.9871, |
|
"eval_samples_per_second": 7.03, |
|
"eval_steps_per_second": 1.004, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 3.12, |
|
"grad_norm": 1.9938311808450486, |
|
"learning_rate": 0.0001600742264237979, |
|
"loss": 2.1962, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 3.25, |
|
"grad_norm": 3.330986691029466, |
|
"learning_rate": 0.00015680647467311557, |
|
"loss": 1.9635, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 3.25, |
|
"eval_loss": 2.0707101821899414, |
|
"eval_runtime": 2.9895, |
|
"eval_samples_per_second": 7.025, |
|
"eval_steps_per_second": 1.004, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 3.38, |
|
"grad_norm": 2.0371854480792178, |
|
"learning_rate": 0.0001534465826127801, |
|
"loss": 2.2319, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 3.5, |
|
"grad_norm": 3.2163831286077653, |
|
"learning_rate": 0.00015000000000000001, |
|
"loss": 2.092, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 3.5, |
|
"eval_loss": 2.059619426727295, |
|
"eval_runtime": 2.9996, |
|
"eval_samples_per_second": 7.001, |
|
"eval_steps_per_second": 1.0, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 3.62, |
|
"grad_norm": 2.853987323853131, |
|
"learning_rate": 0.00014647231720437686, |
|
"loss": 1.9182, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 3.75, |
|
"grad_norm": 2.2997509863024352, |
|
"learning_rate": 0.00014286925614030542, |
|
"loss": 1.9675, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 3.75, |
|
"eval_loss": 2.0287458896636963, |
|
"eval_runtime": 2.9966, |
|
"eval_samples_per_second": 7.008, |
|
"eval_steps_per_second": 1.001, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 3.88, |
|
"grad_norm": 2.2770679758385244, |
|
"learning_rate": 0.00013919666098600753, |
|
"loss": 1.9815, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 0.8553765652252152, |
|
"learning_rate": 0.00013546048870425356, |
|
"loss": 1.9693, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_loss": 2.022012710571289, |
|
"eval_runtime": 2.9895, |
|
"eval_samples_per_second": 7.025, |
|
"eval_steps_per_second": 1.004, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 4.12, |
|
"grad_norm": 3.8094922067262336, |
|
"learning_rate": 0.00013166679938014726, |
|
"loss": 1.6479, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 4.25, |
|
"grad_norm": 3.5435911597121277, |
|
"learning_rate": 0.0001278217463916453, |
|
"loss": 2.0198, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 4.25, |
|
"eval_loss": 2.012432336807251, |
|
"eval_runtime": 2.9987, |
|
"eval_samples_per_second": 7.003, |
|
"eval_steps_per_second": 1.0, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 4.38, |
|
"grad_norm": 1.4676241516417539, |
|
"learning_rate": 0.0001239315664287558, |
|
"loss": 1.7496, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 4.5, |
|
"grad_norm": 1.4772602834377506, |
|
"learning_rate": 0.00012000256937760445, |
|
"loss": 1.9357, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 4.5, |
|
"eval_loss": 1.9945744276046753, |
|
"eval_runtime": 3.0019, |
|
"eval_samples_per_second": 6.995, |
|
"eval_steps_per_second": 0.999, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 4.62, |
|
"grad_norm": 0.8198622785029981, |
|
"learning_rate": 0.00011604112808577603, |
|
"loss": 1.8365, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 4.75, |
|
"grad_norm": 2.5267989029749556, |
|
"learning_rate": 0.0001120536680255323, |
|
"loss": 1.8147, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 4.75, |
|
"eval_loss": 1.9979486465454102, |
|
"eval_runtime": 2.9865, |
|
"eval_samples_per_second": 7.032, |
|
"eval_steps_per_second": 1.005, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 4.88, |
|
"grad_norm": 1.2889515222114942, |
|
"learning_rate": 0.00010804665687167262, |
|
"loss": 1.6703, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 1.3474067788797102, |
|
"learning_rate": 0.00010402659401094152, |
|
"loss": 1.9084, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_loss": 1.9750508069992065, |
|
"eval_runtime": 2.9945, |
|
"eval_samples_per_second": 7.013, |
|
"eval_steps_per_second": 1.002, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 5.12, |
|
"grad_norm": 1.320063776368443, |
|
"learning_rate": 0.0001, |
|
"loss": 1.6233, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 5.25, |
|
"grad_norm": 0.7858628087737163, |
|
"learning_rate": 9.597340598905852e-05, |
|
"loss": 1.6678, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 5.25, |
|
"eval_loss": 2.004897356033325, |
|
"eval_runtime": 2.9946, |
|
"eval_samples_per_second": 7.013, |
|
"eval_steps_per_second": 1.002, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 5.38, |
|
"grad_norm": 1.149181462350102, |
|
"learning_rate": 9.195334312832742e-05, |
|
"loss": 1.5673, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 5.5, |
|
"grad_norm": 1.961547695831496, |
|
"learning_rate": 8.79463319744677e-05, |
|
"loss": 1.7639, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 5.5, |
|
"eval_loss": 1.9885122776031494, |
|
"eval_runtime": 2.9905, |
|
"eval_samples_per_second": 7.022, |
|
"eval_steps_per_second": 1.003, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 5.62, |
|
"grad_norm": 0.794217334050356, |
|
"learning_rate": 8.395887191422397e-05, |
|
"loss": 1.6191, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 5.75, |
|
"grad_norm": 1.5568588659062292, |
|
"learning_rate": 7.999743062239557e-05, |
|
"loss": 1.7475, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 5.75, |
|
"eval_loss": 1.9777300357818604, |
|
"eval_runtime": 2.9821, |
|
"eval_samples_per_second": 7.042, |
|
"eval_steps_per_second": 1.006, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 5.88, |
|
"grad_norm": 0.9110203190054421, |
|
"learning_rate": 7.606843357124426e-05, |
|
"loss": 1.5998, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"grad_norm": 1.4501990937976796, |
|
"learning_rate": 7.217825360835473e-05, |
|
"loss": 1.4848, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_loss": 1.9939006567001343, |
|
"eval_runtime": 2.9785, |
|
"eval_samples_per_second": 7.05, |
|
"eval_steps_per_second": 1.007, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 6.12, |
|
"grad_norm": 1.3413384555399062, |
|
"learning_rate": 6.833320061985277e-05, |
|
"loss": 1.5343, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 6.25, |
|
"grad_norm": 0.9844954583473513, |
|
"learning_rate": 6.453951129574644e-05, |
|
"loss": 1.3065, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 6.25, |
|
"eval_loss": 2.0264320373535156, |
|
"eval_runtime": 2.9839, |
|
"eval_samples_per_second": 7.038, |
|
"eval_steps_per_second": 1.005, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 6.38, |
|
"grad_norm": 1.268663878876962, |
|
"learning_rate": 6.080333901399251e-05, |
|
"loss": 1.4153, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 6.5, |
|
"grad_norm": 1.1638516740810099, |
|
"learning_rate": 5.713074385969457e-05, |
|
"loss": 1.4792, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 6.5, |
|
"eval_loss": 2.012540817260742, |
|
"eval_runtime": 2.9954, |
|
"eval_samples_per_second": 7.011, |
|
"eval_steps_per_second": 1.002, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 6.62, |
|
"grad_norm": 0.8956974540095054, |
|
"learning_rate": 5.3527682795623146e-05, |
|
"loss": 1.5184, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 6.75, |
|
"grad_norm": 0.8166104294104601, |
|
"learning_rate": 5.000000000000002e-05, |
|
"loss": 1.4233, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 6.75, |
|
"eval_loss": 2.0203704833984375, |
|
"eval_runtime": 2.9966, |
|
"eval_samples_per_second": 7.008, |
|
"eval_steps_per_second": 1.001, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 6.88, |
|
"grad_norm": 1.2567309830006292, |
|
"learning_rate": 4.6553417387219886e-05, |
|
"loss": 1.5766, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"grad_norm": 1.202021898168564, |
|
"learning_rate": 4.3193525326884435e-05, |
|
"loss": 1.2534, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_loss": 2.0317745208740234, |
|
"eval_runtime": 2.9887, |
|
"eval_samples_per_second": 7.027, |
|
"eval_steps_per_second": 1.004, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 7.12, |
|
"grad_norm": 1.0179404054971375, |
|
"learning_rate": 3.99257735762021e-05, |
|
"loss": 1.3538, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 7.25, |
|
"grad_norm": 0.8024465225797554, |
|
"learning_rate": 3.675546244046228e-05, |
|
"loss": 1.2409, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 7.25, |
|
"eval_loss": 2.0444860458374023, |
|
"eval_runtime": 2.9957, |
|
"eval_samples_per_second": 7.01, |
|
"eval_steps_per_second": 1.001, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 7.38, |
|
"grad_norm": 1.0938821440297672, |
|
"learning_rate": 3.36877341759205e-05, |
|
"loss": 1.2446, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 7.5, |
|
"grad_norm": 1.4397725924431397, |
|
"learning_rate": 3.072756464904006e-05, |
|
"loss": 1.4309, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 7.5, |
|
"eval_loss": 2.0641307830810547, |
|
"eval_runtime": 3.0002, |
|
"eval_samples_per_second": 6.999, |
|
"eval_steps_per_second": 1.0, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 7.62, |
|
"grad_norm": 1.084317322881849, |
|
"learning_rate": 2.7879755265618555e-05, |
|
"loss": 1.4057, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 7.75, |
|
"grad_norm": 0.8921847488708302, |
|
"learning_rate": 2.514892518288988e-05, |
|
"loss": 1.1622, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 7.75, |
|
"eval_loss": 2.0632762908935547, |
|
"eval_runtime": 2.9934, |
|
"eval_samples_per_second": 7.015, |
|
"eval_steps_per_second": 1.002, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 7.88, |
|
"grad_norm": 1.2733235220422945, |
|
"learning_rate": 2.2539503817234553e-05, |
|
"loss": 1.2667, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"grad_norm": 1.01591405423162, |
|
"learning_rate": 2.0055723659649904e-05, |
|
"loss": 1.228, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_loss": 2.09301495552063, |
|
"eval_runtime": 2.9938, |
|
"eval_samples_per_second": 7.014, |
|
"eval_steps_per_second": 1.002, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 8.12, |
|
"grad_norm": 0.9494450303367244, |
|
"learning_rate": 1.7701613410634365e-05, |
|
"loss": 1.1147, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 8.25, |
|
"grad_norm": 0.8254286577206483, |
|
"learning_rate": 1.5480991445620542e-05, |
|
"loss": 1.3076, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 8.25, |
|
"eval_loss": 2.1076860427856445, |
|
"eval_runtime": 2.9974, |
|
"eval_samples_per_second": 7.006, |
|
"eval_steps_per_second": 1.001, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 8.38, |
|
"grad_norm": 0.9874923331530434, |
|
"learning_rate": 1.339745962155613e-05, |
|
"loss": 1.1572, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 8.5, |
|
"grad_norm": 0.8701092754993289, |
|
"learning_rate": 1.1454397434679021e-05, |
|
"loss": 1.2323, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 8.5, |
|
"eval_loss": 2.1060104370117188, |
|
"eval_runtime": 2.9923, |
|
"eval_samples_per_second": 7.018, |
|
"eval_steps_per_second": 1.003, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 8.62, |
|
"grad_norm": 0.9048894666644874, |
|
"learning_rate": 9.65495653896179e-06, |
|
"loss": 1.1888, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 8.75, |
|
"grad_norm": 0.8899151834513122, |
|
"learning_rate": 8.002055634117578e-06, |
|
"loss": 1.1635, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 8.75, |
|
"eval_loss": 2.1039013862609863, |
|
"eval_runtime": 2.9883, |
|
"eval_samples_per_second": 7.027, |
|
"eval_steps_per_second": 1.004, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 8.88, |
|
"grad_norm": 0.9759646607551775, |
|
"learning_rate": 6.498375731458528e-06, |
|
"loss": 1.0924, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"grad_norm": 1.067529387326401, |
|
"learning_rate": 5.146355805285452e-06, |
|
"loss": 1.261, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_loss": 2.1068060398101807, |
|
"eval_runtime": 2.9995, |
|
"eval_samples_per_second": 7.001, |
|
"eval_steps_per_second": 1.0, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 9.12, |
|
"grad_norm": 0.6912625265797223, |
|
"learning_rate": 3.948188836862776e-06, |
|
"loss": 1.2225, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 9.25, |
|
"grad_norm": 0.8624797224342494, |
|
"learning_rate": 2.905818257394799e-06, |
|
"loss": 1.0122, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 9.25, |
|
"eval_loss": 2.1110289096832275, |
|
"eval_runtime": 2.9973, |
|
"eval_samples_per_second": 7.006, |
|
"eval_steps_per_second": 1.001, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 9.38, |
|
"grad_norm": 0.7882237650664056, |
|
"learning_rate": 2.0209347957732328e-06, |
|
"loss": 1.0959, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 9.5, |
|
"grad_norm": 0.8572353081855683, |
|
"learning_rate": 1.2949737362087156e-06, |
|
"loss": 1.218, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 9.5, |
|
"eval_loss": 2.117999315261841, |
|
"eval_runtime": 2.9874, |
|
"eval_samples_per_second": 7.03, |
|
"eval_steps_per_second": 1.004, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 9.62, |
|
"grad_norm": 0.8712624014542376, |
|
"learning_rate": 7.291125901946027e-07, |
|
"loss": 1.2579, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 9.75, |
|
"grad_norm": 0.7643303727279644, |
|
"learning_rate": 3.2426918657900704e-07, |
|
"loss": 1.1022, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 9.75, |
|
"eval_loss": 2.1226046085357666, |
|
"eval_runtime": 2.9845, |
|
"eval_samples_per_second": 7.036, |
|
"eval_steps_per_second": 1.005, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 9.88, |
|
"grad_norm": 0.7335554379014946, |
|
"learning_rate": 8.110018284304133e-08, |
|
"loss": 1.1355, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 0.7141910762762422, |
|
"learning_rate": 0.0, |
|
"loss": 1.2072, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_loss": 2.1142799854278564, |
|
"eval_runtime": 2.9807, |
|
"eval_samples_per_second": 7.045, |
|
"eval_steps_per_second": 1.006, |
|
"step": 80 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 80, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 8, |
|
"total_flos": 3.704687606680781e+17, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|