{ "best_metric": 1.9750508069992065, "best_model_checkpoint": "./gemma-python/checkpoint-40", "epoch": 10.0, "eval_steps": 2, "global_step": 80, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.12, "grad_norm": 40.636978402335416, "learning_rate": 0.0001, "loss": 19.0016, "step": 1 }, { "epoch": 0.12, "eval_loss": 18.6992130279541, "eval_runtime": 2.881, "eval_samples_per_second": 7.289, "eval_steps_per_second": 1.041, "step": 1 }, { "epoch": 0.25, "grad_norm": 41.61053527062362, "learning_rate": 0.0002, "loss": 19.4686, "step": 2 }, { "epoch": 0.25, "eval_loss": 16.257802963256836, "eval_runtime": 2.9111, "eval_samples_per_second": 7.214, "eval_steps_per_second": 1.031, "step": 2 }, { "epoch": 0.38, "grad_norm": 28.704819713850974, "learning_rate": 0.00019991889981715698, "loss": 13.2303, "step": 3 }, { "epoch": 0.5, "grad_norm": 26.40444243073739, "learning_rate": 0.00019967573081342103, "loss": 11.468, "step": 4 }, { "epoch": 0.5, "eval_loss": 8.28911018371582, "eval_runtime": 2.9257, "eval_samples_per_second": 7.178, "eval_steps_per_second": 1.025, "step": 4 }, { "epoch": 0.62, "grad_norm": 12.912981323843146, "learning_rate": 0.0001992708874098054, "loss": 9.3107, "step": 5 }, { "epoch": 0.75, "grad_norm": 7.943058500648636, "learning_rate": 0.00019870502626379127, "loss": 7.5305, "step": 6 }, { "epoch": 0.75, "eval_loss": 5.884701728820801, "eval_runtime": 2.9479, "eval_samples_per_second": 7.124, "eval_steps_per_second": 1.018, "step": 6 }, { "epoch": 0.88, "grad_norm": 6.267657551985817, "learning_rate": 0.00019797906520422677, "loss": 6.6492, "step": 7 }, { "epoch": 1.0, "grad_norm": 5.0825555341832365, "learning_rate": 0.0001970941817426052, "loss": 5.7572, "step": 8 }, { "epoch": 1.0, "eval_loss": 4.363473892211914, "eval_runtime": 2.9653, "eval_samples_per_second": 7.082, "eval_steps_per_second": 1.012, "step": 8 }, { "epoch": 1.12, "grad_norm": 4.88565620317727, "learning_rate": 0.00019605181116313724, "loss": 4.5414, "step": 9 }, { "epoch": 1.25, "grad_norm": 5.0847008955317605, "learning_rate": 0.00019485364419471454, "loss": 4.3903, "step": 10 }, { "epoch": 1.25, "eval_loss": 3.284867763519287, "eval_runtime": 2.9746, "eval_samples_per_second": 7.06, "eval_steps_per_second": 1.009, "step": 10 }, { "epoch": 1.38, "grad_norm": 3.424587898800574, "learning_rate": 0.0001935016242685415, "loss": 3.79, "step": 11 }, { "epoch": 1.5, "grad_norm": 2.7255824385278506, "learning_rate": 0.00019199794436588243, "loss": 2.9497, "step": 12 }, { "epoch": 1.5, "eval_loss": 2.853942394256592, "eval_runtime": 2.9866, "eval_samples_per_second": 7.031, "eval_steps_per_second": 1.004, "step": 12 }, { "epoch": 1.62, "grad_norm": 2.1001906898750624, "learning_rate": 0.00019034504346103823, "loss": 2.7728, "step": 13 }, { "epoch": 1.75, "grad_norm": 1.9200021565941778, "learning_rate": 0.000188545602565321, "loss": 2.8738, "step": 14 }, { "epoch": 1.75, "eval_loss": 2.62028431892395, "eval_runtime": 2.9982, "eval_samples_per_second": 7.004, "eval_steps_per_second": 1.001, "step": 14 }, { "epoch": 1.88, "grad_norm": 1.8837224890225774, "learning_rate": 0.00018660254037844388, "loss": 3.0787, "step": 15 }, { "epoch": 2.0, "grad_norm": 1.8929687978608318, "learning_rate": 0.0001845190085543795, "loss": 2.7298, "step": 16 }, { "epoch": 2.0, "eval_loss": 2.453444242477417, "eval_runtime": 2.9964, "eval_samples_per_second": 7.008, "eval_steps_per_second": 1.001, "step": 16 }, { "epoch": 2.12, "grad_norm": 1.3652069569291694, "learning_rate": 0.00018229838658936564, "loss": 2.5967, "step": 17 }, { "epoch": 2.25, "grad_norm": 2.4263600812149417, "learning_rate": 0.00017994427634035015, "loss": 2.4284, "step": 18 }, { "epoch": 2.25, "eval_loss": 2.307706832885742, "eval_runtime": 2.9963, "eval_samples_per_second": 7.009, "eval_steps_per_second": 1.001, "step": 18 }, { "epoch": 2.38, "grad_norm": 2.5673391658400053, "learning_rate": 0.00017746049618276545, "loss": 2.6721, "step": 19 }, { "epoch": 2.5, "grad_norm": 2.2252437500899656, "learning_rate": 0.00017485107481711012, "loss": 2.394, "step": 20 }, { "epoch": 2.5, "eval_loss": 2.187636137008667, "eval_runtime": 2.9975, "eval_samples_per_second": 7.006, "eval_steps_per_second": 1.001, "step": 20 }, { "epoch": 2.62, "grad_norm": 2.345233295279928, "learning_rate": 0.00017212024473438147, "loss": 2.3972, "step": 21 }, { "epoch": 2.75, "grad_norm": 1.1122620317353238, "learning_rate": 0.00016927243535095997, "loss": 2.069, "step": 22 }, { "epoch": 2.75, "eval_loss": 2.1294100284576416, "eval_runtime": 2.993, "eval_samples_per_second": 7.016, "eval_steps_per_second": 1.002, "step": 22 }, { "epoch": 2.88, "grad_norm": 2.8270209249093803, "learning_rate": 0.00016631226582407952, "loss": 2.211, "step": 23 }, { "epoch": 3.0, "grad_norm": 7.323169716541166, "learning_rate": 0.00016324453755953773, "loss": 1.9355, "step": 24 }, { "epoch": 3.0, "eval_loss": 2.1047682762145996, "eval_runtime": 2.9871, "eval_samples_per_second": 7.03, "eval_steps_per_second": 1.004, "step": 24 }, { "epoch": 3.12, "grad_norm": 1.9938311808450486, "learning_rate": 0.0001600742264237979, "loss": 2.1962, "step": 25 }, { "epoch": 3.25, "grad_norm": 3.330986691029466, "learning_rate": 0.00015680647467311557, "loss": 1.9635, "step": 26 }, { "epoch": 3.25, "eval_loss": 2.0707101821899414, "eval_runtime": 2.9895, "eval_samples_per_second": 7.025, "eval_steps_per_second": 1.004, "step": 26 }, { "epoch": 3.38, "grad_norm": 2.0371854480792178, "learning_rate": 0.0001534465826127801, "loss": 2.2319, "step": 27 }, { "epoch": 3.5, "grad_norm": 3.2163831286077653, "learning_rate": 0.00015000000000000001, "loss": 2.092, "step": 28 }, { "epoch": 3.5, "eval_loss": 2.059619426727295, "eval_runtime": 2.9996, "eval_samples_per_second": 7.001, "eval_steps_per_second": 1.0, "step": 28 }, { "epoch": 3.62, "grad_norm": 2.853987323853131, "learning_rate": 0.00014647231720437686, "loss": 1.9182, "step": 29 }, { "epoch": 3.75, "grad_norm": 2.2997509863024352, "learning_rate": 0.00014286925614030542, "loss": 1.9675, "step": 30 }, { "epoch": 3.75, "eval_loss": 2.0287458896636963, "eval_runtime": 2.9966, "eval_samples_per_second": 7.008, "eval_steps_per_second": 1.001, "step": 30 }, { "epoch": 3.88, "grad_norm": 2.2770679758385244, "learning_rate": 0.00013919666098600753, "loss": 1.9815, "step": 31 }, { "epoch": 4.0, "grad_norm": 0.8553765652252152, "learning_rate": 0.00013546048870425356, "loss": 1.9693, "step": 32 }, { "epoch": 4.0, "eval_loss": 2.022012710571289, "eval_runtime": 2.9895, "eval_samples_per_second": 7.025, "eval_steps_per_second": 1.004, "step": 32 }, { "epoch": 4.12, "grad_norm": 3.8094922067262336, "learning_rate": 0.00013166679938014726, "loss": 1.6479, "step": 33 }, { "epoch": 4.25, "grad_norm": 3.5435911597121277, "learning_rate": 0.0001278217463916453, "loss": 2.0198, "step": 34 }, { "epoch": 4.25, "eval_loss": 2.012432336807251, "eval_runtime": 2.9987, "eval_samples_per_second": 7.003, "eval_steps_per_second": 1.0, "step": 34 }, { "epoch": 4.38, "grad_norm": 1.4676241516417539, "learning_rate": 0.0001239315664287558, "loss": 1.7496, "step": 35 }, { "epoch": 4.5, "grad_norm": 1.4772602834377506, "learning_rate": 0.00012000256937760445, "loss": 1.9357, "step": 36 }, { "epoch": 4.5, "eval_loss": 1.9945744276046753, "eval_runtime": 3.0019, "eval_samples_per_second": 6.995, "eval_steps_per_second": 0.999, "step": 36 }, { "epoch": 4.62, "grad_norm": 0.8198622785029981, "learning_rate": 0.00011604112808577603, "loss": 1.8365, "step": 37 }, { "epoch": 4.75, "grad_norm": 2.5267989029749556, "learning_rate": 0.0001120536680255323, "loss": 1.8147, "step": 38 }, { "epoch": 4.75, "eval_loss": 1.9979486465454102, "eval_runtime": 2.9865, "eval_samples_per_second": 7.032, "eval_steps_per_second": 1.005, "step": 38 }, { "epoch": 4.88, "grad_norm": 1.2889515222114942, "learning_rate": 0.00010804665687167262, "loss": 1.6703, "step": 39 }, { "epoch": 5.0, "grad_norm": 1.3474067788797102, "learning_rate": 0.00010402659401094152, "loss": 1.9084, "step": 40 }, { "epoch": 5.0, "eval_loss": 1.9750508069992065, "eval_runtime": 2.9945, "eval_samples_per_second": 7.013, "eval_steps_per_second": 1.002, "step": 40 }, { "epoch": 5.12, "grad_norm": 1.320063776368443, "learning_rate": 0.0001, "loss": 1.6233, "step": 41 }, { "epoch": 5.25, "grad_norm": 0.7858628087737163, "learning_rate": 9.597340598905852e-05, "loss": 1.6678, "step": 42 }, { "epoch": 5.25, "eval_loss": 2.004897356033325, "eval_runtime": 2.9946, "eval_samples_per_second": 7.013, "eval_steps_per_second": 1.002, "step": 42 }, { "epoch": 5.38, "grad_norm": 1.149181462350102, "learning_rate": 9.195334312832742e-05, "loss": 1.5673, "step": 43 }, { "epoch": 5.5, "grad_norm": 1.961547695831496, "learning_rate": 8.79463319744677e-05, "loss": 1.7639, "step": 44 }, { "epoch": 5.5, "eval_loss": 1.9885122776031494, "eval_runtime": 2.9905, "eval_samples_per_second": 7.022, "eval_steps_per_second": 1.003, "step": 44 }, { "epoch": 5.62, "grad_norm": 0.794217334050356, "learning_rate": 8.395887191422397e-05, "loss": 1.6191, "step": 45 }, { "epoch": 5.75, "grad_norm": 1.5568588659062292, "learning_rate": 7.999743062239557e-05, "loss": 1.7475, "step": 46 }, { "epoch": 5.75, "eval_loss": 1.9777300357818604, "eval_runtime": 2.9821, "eval_samples_per_second": 7.042, "eval_steps_per_second": 1.006, "step": 46 }, { "epoch": 5.88, "grad_norm": 0.9110203190054421, "learning_rate": 7.606843357124426e-05, "loss": 1.5998, "step": 47 }, { "epoch": 6.0, "grad_norm": 1.4501990937976796, "learning_rate": 7.217825360835473e-05, "loss": 1.4848, "step": 48 }, { "epoch": 6.0, "eval_loss": 1.9939006567001343, "eval_runtime": 2.9785, "eval_samples_per_second": 7.05, "eval_steps_per_second": 1.007, "step": 48 }, { "epoch": 6.12, "grad_norm": 1.3413384555399062, "learning_rate": 6.833320061985277e-05, "loss": 1.5343, "step": 49 }, { "epoch": 6.25, "grad_norm": 0.9844954583473513, "learning_rate": 6.453951129574644e-05, "loss": 1.3065, "step": 50 }, { "epoch": 6.25, "eval_loss": 2.0264320373535156, "eval_runtime": 2.9839, "eval_samples_per_second": 7.038, "eval_steps_per_second": 1.005, "step": 50 }, { "epoch": 6.38, "grad_norm": 1.268663878876962, "learning_rate": 6.080333901399251e-05, "loss": 1.4153, "step": 51 }, { "epoch": 6.5, "grad_norm": 1.1638516740810099, "learning_rate": 5.713074385969457e-05, "loss": 1.4792, "step": 52 }, { "epoch": 6.5, "eval_loss": 2.012540817260742, "eval_runtime": 2.9954, "eval_samples_per_second": 7.011, "eval_steps_per_second": 1.002, "step": 52 }, { "epoch": 6.62, "grad_norm": 0.8956974540095054, "learning_rate": 5.3527682795623146e-05, "loss": 1.5184, "step": 53 }, { "epoch": 6.75, "grad_norm": 0.8166104294104601, "learning_rate": 5.000000000000002e-05, "loss": 1.4233, "step": 54 }, { "epoch": 6.75, "eval_loss": 2.0203704833984375, "eval_runtime": 2.9966, "eval_samples_per_second": 7.008, "eval_steps_per_second": 1.001, "step": 54 }, { "epoch": 6.88, "grad_norm": 1.2567309830006292, "learning_rate": 4.6553417387219886e-05, "loss": 1.5766, "step": 55 }, { "epoch": 7.0, "grad_norm": 1.202021898168564, "learning_rate": 4.3193525326884435e-05, "loss": 1.2534, "step": 56 }, { "epoch": 7.0, "eval_loss": 2.0317745208740234, "eval_runtime": 2.9887, "eval_samples_per_second": 7.027, "eval_steps_per_second": 1.004, "step": 56 }, { "epoch": 7.12, "grad_norm": 1.0179404054971375, "learning_rate": 3.99257735762021e-05, "loss": 1.3538, "step": 57 }, { "epoch": 7.25, "grad_norm": 0.8024465225797554, "learning_rate": 3.675546244046228e-05, "loss": 1.2409, "step": 58 }, { "epoch": 7.25, "eval_loss": 2.0444860458374023, "eval_runtime": 2.9957, "eval_samples_per_second": 7.01, "eval_steps_per_second": 1.001, "step": 58 }, { "epoch": 7.38, "grad_norm": 1.0938821440297672, "learning_rate": 3.36877341759205e-05, "loss": 1.2446, "step": 59 }, { "epoch": 7.5, "grad_norm": 1.4397725924431397, "learning_rate": 3.072756464904006e-05, "loss": 1.4309, "step": 60 }, { "epoch": 7.5, "eval_loss": 2.0641307830810547, "eval_runtime": 3.0002, "eval_samples_per_second": 6.999, "eval_steps_per_second": 1.0, "step": 60 }, { "epoch": 7.62, "grad_norm": 1.084317322881849, "learning_rate": 2.7879755265618555e-05, "loss": 1.4057, "step": 61 }, { "epoch": 7.75, "grad_norm": 0.8921847488708302, "learning_rate": 2.514892518288988e-05, "loss": 1.1622, "step": 62 }, { "epoch": 7.75, "eval_loss": 2.0632762908935547, "eval_runtime": 2.9934, "eval_samples_per_second": 7.015, "eval_steps_per_second": 1.002, "step": 62 }, { "epoch": 7.88, "grad_norm": 1.2733235220422945, "learning_rate": 2.2539503817234553e-05, "loss": 1.2667, "step": 63 }, { "epoch": 8.0, "grad_norm": 1.01591405423162, "learning_rate": 2.0055723659649904e-05, "loss": 1.228, "step": 64 }, { "epoch": 8.0, "eval_loss": 2.09301495552063, "eval_runtime": 2.9938, "eval_samples_per_second": 7.014, "eval_steps_per_second": 1.002, "step": 64 }, { "epoch": 8.12, "grad_norm": 0.9494450303367244, "learning_rate": 1.7701613410634365e-05, "loss": 1.1147, "step": 65 }, { "epoch": 8.25, "grad_norm": 0.8254286577206483, "learning_rate": 1.5480991445620542e-05, "loss": 1.3076, "step": 66 }, { "epoch": 8.25, "eval_loss": 2.1076860427856445, "eval_runtime": 2.9974, "eval_samples_per_second": 7.006, "eval_steps_per_second": 1.001, "step": 66 }, { "epoch": 8.38, "grad_norm": 0.9874923331530434, "learning_rate": 1.339745962155613e-05, "loss": 1.1572, "step": 67 }, { "epoch": 8.5, "grad_norm": 0.8701092754993289, "learning_rate": 1.1454397434679021e-05, "loss": 1.2323, "step": 68 }, { "epoch": 8.5, "eval_loss": 2.1060104370117188, "eval_runtime": 2.9923, "eval_samples_per_second": 7.018, "eval_steps_per_second": 1.003, "step": 68 }, { "epoch": 8.62, "grad_norm": 0.9048894666644874, "learning_rate": 9.65495653896179e-06, "loss": 1.1888, "step": 69 }, { "epoch": 8.75, "grad_norm": 0.8899151834513122, "learning_rate": 8.002055634117578e-06, "loss": 1.1635, "step": 70 }, { "epoch": 8.75, "eval_loss": 2.1039013862609863, "eval_runtime": 2.9883, "eval_samples_per_second": 7.027, "eval_steps_per_second": 1.004, "step": 70 }, { "epoch": 8.88, "grad_norm": 0.9759646607551775, "learning_rate": 6.498375731458528e-06, "loss": 1.0924, "step": 71 }, { "epoch": 9.0, "grad_norm": 1.067529387326401, "learning_rate": 5.146355805285452e-06, "loss": 1.261, "step": 72 }, { "epoch": 9.0, "eval_loss": 2.1068060398101807, "eval_runtime": 2.9995, "eval_samples_per_second": 7.001, "eval_steps_per_second": 1.0, "step": 72 }, { "epoch": 9.12, "grad_norm": 0.6912625265797223, "learning_rate": 3.948188836862776e-06, "loss": 1.2225, "step": 73 }, { "epoch": 9.25, "grad_norm": 0.8624797224342494, "learning_rate": 2.905818257394799e-06, "loss": 1.0122, "step": 74 }, { "epoch": 9.25, "eval_loss": 2.1110289096832275, "eval_runtime": 2.9973, "eval_samples_per_second": 7.006, "eval_steps_per_second": 1.001, "step": 74 }, { "epoch": 9.38, "grad_norm": 0.7882237650664056, "learning_rate": 2.0209347957732328e-06, "loss": 1.0959, "step": 75 }, { "epoch": 9.5, "grad_norm": 0.8572353081855683, "learning_rate": 1.2949737362087156e-06, "loss": 1.218, "step": 76 }, { "epoch": 9.5, "eval_loss": 2.117999315261841, "eval_runtime": 2.9874, "eval_samples_per_second": 7.03, "eval_steps_per_second": 1.004, "step": 76 }, { "epoch": 9.62, "grad_norm": 0.8712624014542376, "learning_rate": 7.291125901946027e-07, "loss": 1.2579, "step": 77 }, { "epoch": 9.75, "grad_norm": 0.7643303727279644, "learning_rate": 3.2426918657900704e-07, "loss": 1.1022, "step": 78 }, { "epoch": 9.75, "eval_loss": 2.1226046085357666, "eval_runtime": 2.9845, "eval_samples_per_second": 7.036, "eval_steps_per_second": 1.005, "step": 78 }, { "epoch": 9.88, "grad_norm": 0.7335554379014946, "learning_rate": 8.110018284304133e-08, "loss": 1.1355, "step": 79 }, { "epoch": 10.0, "grad_norm": 0.7141910762762422, "learning_rate": 0.0, "loss": 1.2072, "step": 80 }, { "epoch": 10.0, "eval_loss": 2.1142799854278564, "eval_runtime": 2.9807, "eval_samples_per_second": 7.045, "eval_steps_per_second": 1.006, "step": 80 } ], "logging_steps": 1, "max_steps": 80, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 8, "total_flos": 3.704687606680781e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }