{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 3, "global_step": 282, "is_hyper_param_search": true, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.031914893617021274, "grad_norm": 212.81973266601562, "learning_rate": 3.7450394203144474e-05, "loss": 2.2611, "step": 3 }, { "epoch": 0.031914893617021274, "eval_loss": 1.6954327821731567, "eval_runtime": 90.1393, "eval_samples_per_second": 3.328, "eval_steps_per_second": 0.111, "step": 3 }, { "epoch": 0.06382978723404255, "grad_norm": 94.31295776367188, "learning_rate": 3.704770179235797e-05, "loss": 2.1851, "step": 6 }, { "epoch": 0.06382978723404255, "eval_loss": 1.83690345287323, "eval_runtime": 45.0365, "eval_samples_per_second": 6.661, "eval_steps_per_second": 0.222, "step": 6 }, { "epoch": 0.09574468085106383, "grad_norm": 88.24703216552734, "learning_rate": 3.6645009381571474e-05, "loss": 1.9095, "step": 9 }, { "epoch": 0.09574468085106383, "eval_loss": 1.8263834714889526, "eval_runtime": 44.4544, "eval_samples_per_second": 6.748, "eval_steps_per_second": 0.225, "step": 9 }, { "epoch": 0.1276595744680851, "grad_norm": 73.16114044189453, "learning_rate": 3.6242316970784977e-05, "loss": 1.6022, "step": 12 }, { "epoch": 0.1276595744680851, "eval_loss": 1.6566990613937378, "eval_runtime": 45.0842, "eval_samples_per_second": 6.654, "eval_steps_per_second": 0.222, "step": 12 }, { "epoch": 0.1595744680851064, "grad_norm": 74.73725128173828, "learning_rate": 3.583962455999847e-05, "loss": 1.5181, "step": 15 }, { "epoch": 0.1595744680851064, "eval_loss": 1.6280070543289185, "eval_runtime": 45.1381, "eval_samples_per_second": 6.646, "eval_steps_per_second": 0.222, "step": 15 }, { "epoch": 0.19148936170212766, "grad_norm": 79.7146987915039, "learning_rate": 3.5436932149211976e-05, "loss": 1.8478, "step": 18 }, { "epoch": 0.19148936170212766, "eval_loss": 1.5796161890029907, "eval_runtime": 45.9387, "eval_samples_per_second": 6.53, "eval_steps_per_second": 0.218, "step": 18 }, { "epoch": 0.22340425531914893, "grad_norm": 65.3478775024414, "learning_rate": 3.503423973842547e-05, "loss": 1.3884, "step": 21 }, { "epoch": 0.22340425531914893, "eval_loss": 1.5002487897872925, "eval_runtime": 45.7227, "eval_samples_per_second": 6.561, "eval_steps_per_second": 0.219, "step": 21 }, { "epoch": 0.2553191489361702, "grad_norm": 66.32596588134766, "learning_rate": 3.4631547327638975e-05, "loss": 1.4368, "step": 24 }, { "epoch": 0.2553191489361702, "eval_loss": 1.5191569328308105, "eval_runtime": 44.4497, "eval_samples_per_second": 6.749, "eval_steps_per_second": 0.225, "step": 24 }, { "epoch": 0.2872340425531915, "grad_norm": 67.5133056640625, "learning_rate": 3.422885491685248e-05, "loss": 1.6515, "step": 27 }, { "epoch": 0.2872340425531915, "eval_loss": 1.4516819715499878, "eval_runtime": 44.968, "eval_samples_per_second": 6.671, "eval_steps_per_second": 0.222, "step": 27 }, { "epoch": 0.3191489361702128, "grad_norm": 56.178218841552734, "learning_rate": 3.3826162506065975e-05, "loss": 1.5518, "step": 30 }, { "epoch": 0.3191489361702128, "eval_loss": 1.4015744924545288, "eval_runtime": 44.5216, "eval_samples_per_second": 6.738, "eval_steps_per_second": 0.225, "step": 30 }, { "epoch": 0.35106382978723405, "grad_norm": 62.902313232421875, "learning_rate": 3.342347009527948e-05, "loss": 1.339, "step": 33 }, { "epoch": 0.35106382978723405, "eval_loss": 1.3566683530807495, "eval_runtime": 44.9809, "eval_samples_per_second": 6.669, "eval_steps_per_second": 0.222, "step": 33 }, { "epoch": 0.3829787234042553, "grad_norm": 57.751197814941406, "learning_rate": 3.3020777684492974e-05, "loss": 1.6338, "step": 36 }, { "epoch": 0.3829787234042553, "eval_loss": 1.396578073501587, "eval_runtime": 45.3817, "eval_samples_per_second": 6.611, "eval_steps_per_second": 0.22, "step": 36 }, { "epoch": 0.4148936170212766, "grad_norm": 57.7042121887207, "learning_rate": 3.261808527370648e-05, "loss": 1.4372, "step": 39 }, { "epoch": 0.4148936170212766, "eval_loss": 1.3969274759292603, "eval_runtime": 63.0438, "eval_samples_per_second": 4.759, "eval_steps_per_second": 0.159, "step": 39 }, { "epoch": 0.44680851063829785, "grad_norm": 43.087100982666016, "learning_rate": 3.221539286291997e-05, "loss": 1.2751, "step": 42 }, { "epoch": 0.44680851063829785, "eval_loss": 1.3432629108428955, "eval_runtime": 43.0776, "eval_samples_per_second": 6.964, "eval_steps_per_second": 0.232, "step": 42 }, { "epoch": 0.4787234042553192, "grad_norm": 58.544620513916016, "learning_rate": 3.1812700452133476e-05, "loss": 1.484, "step": 45 }, { "epoch": 0.4787234042553192, "eval_loss": 1.389102578163147, "eval_runtime": 42.2187, "eval_samples_per_second": 7.106, "eval_steps_per_second": 0.237, "step": 45 }, { "epoch": 0.5106382978723404, "grad_norm": 40.23255920410156, "learning_rate": 3.141000804134697e-05, "loss": 1.4936, "step": 48 }, { "epoch": 0.5106382978723404, "eval_loss": 1.408040165901184, "eval_runtime": 42.8634, "eval_samples_per_second": 6.999, "eval_steps_per_second": 0.233, "step": 48 }, { "epoch": 0.5425531914893617, "grad_norm": 58.45737075805664, "learning_rate": 3.1007315630560475e-05, "loss": 1.6771, "step": 51 }, { "epoch": 0.5425531914893617, "eval_loss": 1.3876069784164429, "eval_runtime": 47.7351, "eval_samples_per_second": 6.285, "eval_steps_per_second": 0.209, "step": 51 }, { "epoch": 0.574468085106383, "grad_norm": 59.58686447143555, "learning_rate": 3.060462321977397e-05, "loss": 1.7328, "step": 54 }, { "epoch": 0.574468085106383, "eval_loss": 1.3280422687530518, "eval_runtime": 45.1191, "eval_samples_per_second": 6.649, "eval_steps_per_second": 0.222, "step": 54 }, { "epoch": 0.6063829787234043, "grad_norm": 49.796451568603516, "learning_rate": 3.0201930808987475e-05, "loss": 1.3566, "step": 57 }, { "epoch": 0.6063829787234043, "eval_loss": 1.3299312591552734, "eval_runtime": 45.4372, "eval_samples_per_second": 6.603, "eval_steps_per_second": 0.22, "step": 57 }, { "epoch": 0.6382978723404256, "grad_norm": 47.63386154174805, "learning_rate": 2.9799238398200978e-05, "loss": 1.1698, "step": 60 }, { "epoch": 0.6382978723404256, "eval_loss": 1.3619319200515747, "eval_runtime": 44.5898, "eval_samples_per_second": 6.728, "eval_steps_per_second": 0.224, "step": 60 }, { "epoch": 0.6702127659574468, "grad_norm": 51.1750602722168, "learning_rate": 2.9396545987414478e-05, "loss": 1.4553, "step": 63 }, { "epoch": 0.6702127659574468, "eval_loss": 1.2944597005844116, "eval_runtime": 44.0116, "eval_samples_per_second": 6.816, "eval_steps_per_second": 0.227, "step": 63 }, { "epoch": 0.7021276595744681, "grad_norm": 56.35239028930664, "learning_rate": 2.8993853576627977e-05, "loss": 1.4532, "step": 66 }, { "epoch": 0.7021276595744681, "eval_loss": 1.3396683931350708, "eval_runtime": 57.5395, "eval_samples_per_second": 5.214, "eval_steps_per_second": 0.174, "step": 66 }, { "epoch": 0.7340425531914894, "grad_norm": 55.66822052001953, "learning_rate": 2.859116116584148e-05, "loss": 1.5623, "step": 69 }, { "epoch": 0.7340425531914894, "eval_loss": 1.3019959926605225, "eval_runtime": 42.7505, "eval_samples_per_second": 7.017, "eval_steps_per_second": 0.234, "step": 69 }, { "epoch": 0.7659574468085106, "grad_norm": 83.2627944946289, "learning_rate": 2.8188468755054976e-05, "loss": 1.5575, "step": 72 }, { "epoch": 0.7659574468085106, "eval_loss": 1.2901865243911743, "eval_runtime": 42.3414, "eval_samples_per_second": 7.085, "eval_steps_per_second": 0.236, "step": 72 }, { "epoch": 0.7978723404255319, "grad_norm": 66.81884002685547, "learning_rate": 2.778577634426848e-05, "loss": 1.4042, "step": 75 }, { "epoch": 0.7978723404255319, "eval_loss": 1.2550606727600098, "eval_runtime": 42.5955, "eval_samples_per_second": 7.043, "eval_steps_per_second": 0.235, "step": 75 }, { "epoch": 0.8297872340425532, "grad_norm": 47.388999938964844, "learning_rate": 2.738308393348198e-05, "loss": 1.3868, "step": 78 }, { "epoch": 0.8297872340425532, "eval_loss": 1.2457486391067505, "eval_runtime": 42.5244, "eval_samples_per_second": 7.055, "eval_steps_per_second": 0.235, "step": 78 }, { "epoch": 0.8617021276595744, "grad_norm": 51.87979507446289, "learning_rate": 2.698039152269548e-05, "loss": 1.3286, "step": 81 }, { "epoch": 0.8617021276595744, "eval_loss": 1.2376227378845215, "eval_runtime": 42.3825, "eval_samples_per_second": 7.078, "eval_steps_per_second": 0.236, "step": 81 }, { "epoch": 0.8936170212765957, "grad_norm": 61.56190490722656, "learning_rate": 2.6577699111908982e-05, "loss": 1.2435, "step": 84 }, { "epoch": 0.8936170212765957, "eval_loss": 1.1932413578033447, "eval_runtime": 43.0229, "eval_samples_per_second": 6.973, "eval_steps_per_second": 0.232, "step": 84 }, { "epoch": 0.925531914893617, "grad_norm": 56.688331604003906, "learning_rate": 2.617500670112248e-05, "loss": 1.1201, "step": 87 }, { "epoch": 0.925531914893617, "eval_loss": 1.1663594245910645, "eval_runtime": 42.6963, "eval_samples_per_second": 7.026, "eval_steps_per_second": 0.234, "step": 87 }, { "epoch": 0.9574468085106383, "grad_norm": 54.14950180053711, "learning_rate": 2.577231429033598e-05, "loss": 1.386, "step": 90 }, { "epoch": 0.9574468085106383, "eval_loss": 1.1541115045547485, "eval_runtime": 42.4377, "eval_samples_per_second": 7.069, "eval_steps_per_second": 0.236, "step": 90 }, { "epoch": 0.9893617021276596, "grad_norm": 63.0470085144043, "learning_rate": 2.536962187954948e-05, "loss": 1.0404, "step": 93 }, { "epoch": 0.9893617021276596, "eval_loss": 1.1485306024551392, "eval_runtime": 42.3281, "eval_samples_per_second": 7.087, "eval_steps_per_second": 0.236, "step": 93 }, { "epoch": 1.0212765957446808, "grad_norm": 29.81873321533203, "learning_rate": 2.496692946876298e-05, "loss": 0.7278, "step": 96 }, { "epoch": 1.0212765957446808, "eval_loss": 1.1454874277114868, "eval_runtime": 4.1684, "eval_samples_per_second": 71.97, "eval_steps_per_second": 2.399, "step": 96 }, { "epoch": 1.053191489361702, "grad_norm": 33.473297119140625, "learning_rate": 2.456423705797648e-05, "loss": 0.3543, "step": 99 }, { "epoch": 1.053191489361702, "eval_loss": 1.177135944366455, "eval_runtime": 42.0082, "eval_samples_per_second": 7.141, "eval_steps_per_second": 0.238, "step": 99 }, { "epoch": 1.0851063829787233, "grad_norm": 23.966768264770508, "learning_rate": 2.4161544647189983e-05, "loss": 0.2839, "step": 102 }, { "epoch": 1.0851063829787233, "eval_loss": 1.2594685554504395, "eval_runtime": 4.1814, "eval_samples_per_second": 71.746, "eval_steps_per_second": 2.392, "step": 102 }, { "epoch": 1.1170212765957448, "grad_norm": 23.973447799682617, "learning_rate": 2.375885223640348e-05, "loss": 0.2303, "step": 105 }, { "epoch": 1.1170212765957448, "eval_loss": 1.3750962018966675, "eval_runtime": 42.0396, "eval_samples_per_second": 7.136, "eval_steps_per_second": 0.238, "step": 105 }, { "epoch": 1.148936170212766, "grad_norm": 36.35573959350586, "learning_rate": 2.3356159825616983e-05, "loss": 0.3702, "step": 108 }, { "epoch": 1.148936170212766, "eval_loss": 1.4052996635437012, "eval_runtime": 4.1732, "eval_samples_per_second": 71.887, "eval_steps_per_second": 2.396, "step": 108 }, { "epoch": 1.1808510638297873, "grad_norm": 37.33869171142578, "learning_rate": 2.2953467414830486e-05, "loss": 0.37, "step": 111 }, { "epoch": 1.1808510638297873, "eval_loss": 1.4478391408920288, "eval_runtime": 42.2951, "eval_samples_per_second": 7.093, "eval_steps_per_second": 0.236, "step": 111 }, { "epoch": 1.2127659574468086, "grad_norm": 32.96900177001953, "learning_rate": 2.2550775004043982e-05, "loss": 0.2412, "step": 114 }, { "epoch": 1.2127659574468086, "eval_loss": 1.4774221181869507, "eval_runtime": 5.0258, "eval_samples_per_second": 59.693, "eval_steps_per_second": 1.99, "step": 114 }, { "epoch": 1.2446808510638299, "grad_norm": 45.11040496826172, "learning_rate": 2.2148082593257485e-05, "loss": 0.3028, "step": 117 }, { "epoch": 1.2446808510638299, "eval_loss": 1.4591237306594849, "eval_runtime": 41.5496, "eval_samples_per_second": 7.22, "eval_steps_per_second": 0.241, "step": 117 }, { "epoch": 1.2765957446808511, "grad_norm": 31.62494659423828, "learning_rate": 2.1745390182470985e-05, "loss": 0.322, "step": 120 }, { "epoch": 1.2765957446808511, "eval_loss": 1.483196496963501, "eval_runtime": 4.901, "eval_samples_per_second": 61.212, "eval_steps_per_second": 2.04, "step": 120 }, { "epoch": 1.3085106382978724, "grad_norm": 53.78076934814453, "learning_rate": 2.1342697771684484e-05, "loss": 0.4919, "step": 123 }, { "epoch": 1.3085106382978724, "eval_loss": 1.4939560890197754, "eval_runtime": 41.6747, "eval_samples_per_second": 7.199, "eval_steps_per_second": 0.24, "step": 123 }, { "epoch": 1.3404255319148937, "grad_norm": 54.71748352050781, "learning_rate": 2.0940005360897984e-05, "loss": 0.3868, "step": 126 }, { "epoch": 1.3404255319148937, "eval_loss": 1.4461126327514648, "eval_runtime": 4.3166, "eval_samples_per_second": 69.498, "eval_steps_per_second": 2.317, "step": 126 }, { "epoch": 1.372340425531915, "grad_norm": 39.04596710205078, "learning_rate": 2.0537312950111484e-05, "loss": 0.3795, "step": 129 }, { "epoch": 1.372340425531915, "eval_loss": 1.421697735786438, "eval_runtime": 41.8322, "eval_samples_per_second": 7.172, "eval_steps_per_second": 0.239, "step": 129 }, { "epoch": 1.4042553191489362, "grad_norm": 43.464900970458984, "learning_rate": 2.0134620539324983e-05, "loss": 0.2185, "step": 132 }, { "epoch": 1.4042553191489362, "eval_loss": 1.4404044151306152, "eval_runtime": 4.1662, "eval_samples_per_second": 72.008, "eval_steps_per_second": 2.4, "step": 132 }, { "epoch": 1.4361702127659575, "grad_norm": 39.858341217041016, "learning_rate": 1.9731928128538486e-05, "loss": 0.361, "step": 135 }, { "epoch": 1.4361702127659575, "eval_loss": 1.445768117904663, "eval_runtime": 41.5719, "eval_samples_per_second": 7.216, "eval_steps_per_second": 0.241, "step": 135 }, { "epoch": 1.4680851063829787, "grad_norm": 35.612701416015625, "learning_rate": 1.9329235717751983e-05, "loss": 0.3308, "step": 138 }, { "epoch": 1.4680851063829787, "eval_loss": 1.447102665901184, "eval_runtime": 4.2095, "eval_samples_per_second": 71.267, "eval_steps_per_second": 2.376, "step": 138 }, { "epoch": 1.5, "grad_norm": 40.08439636230469, "learning_rate": 1.8926543306965486e-05, "loss": 0.29, "step": 141 }, { "epoch": 1.5, "eval_loss": 1.4261672496795654, "eval_runtime": 41.3155, "eval_samples_per_second": 7.261, "eval_steps_per_second": 0.242, "step": 141 }, { "epoch": 1.5319148936170213, "grad_norm": 34.78931427001953, "learning_rate": 1.8523850896178985e-05, "loss": 0.2372, "step": 144 }, { "epoch": 1.5319148936170213, "eval_loss": 1.4144420623779297, "eval_runtime": 4.1971, "eval_samples_per_second": 71.478, "eval_steps_per_second": 2.383, "step": 144 }, { "epoch": 1.5638297872340425, "grad_norm": 45.00236892700195, "learning_rate": 1.8121158485392488e-05, "loss": 0.2666, "step": 147 }, { "epoch": 1.5638297872340425, "eval_loss": 1.4330610036849976, "eval_runtime": 41.5952, "eval_samples_per_second": 7.212, "eval_steps_per_second": 0.24, "step": 147 }, { "epoch": 1.5957446808510638, "grad_norm": 53.813011169433594, "learning_rate": 1.7718466074605988e-05, "loss": 0.357, "step": 150 }, { "epoch": 1.5957446808510638, "eval_loss": 1.49522864818573, "eval_runtime": 4.8438, "eval_samples_per_second": 61.935, "eval_steps_per_second": 2.065, "step": 150 }, { "epoch": 1.627659574468085, "grad_norm": 50.32698440551758, "learning_rate": 1.7315773663819488e-05, "loss": 0.3668, "step": 153 }, { "epoch": 1.627659574468085, "eval_loss": 1.5075035095214844, "eval_runtime": 42.1649, "eval_samples_per_second": 7.115, "eval_steps_per_second": 0.237, "step": 153 }, { "epoch": 1.6595744680851063, "grad_norm": 29.047195434570312, "learning_rate": 1.6913081253032987e-05, "loss": 0.257, "step": 156 }, { "epoch": 1.6595744680851063, "eval_loss": 1.499000072479248, "eval_runtime": 5.1976, "eval_samples_per_second": 57.719, "eval_steps_per_second": 1.924, "step": 156 }, { "epoch": 1.6914893617021276, "grad_norm": 44.64079666137695, "learning_rate": 1.6510388842246487e-05, "loss": 0.2812, "step": 159 }, { "epoch": 1.6914893617021276, "eval_loss": 1.5078258514404297, "eval_runtime": 41.7195, "eval_samples_per_second": 7.191, "eval_steps_per_second": 0.24, "step": 159 }, { "epoch": 1.7234042553191489, "grad_norm": 45.4018669128418, "learning_rate": 1.6107696431459987e-05, "loss": 0.3697, "step": 162 }, { "epoch": 1.7234042553191489, "eval_loss": 1.4968957901000977, "eval_runtime": 4.6701, "eval_samples_per_second": 64.239, "eval_steps_per_second": 2.141, "step": 162 }, { "epoch": 1.7553191489361701, "grad_norm": 23.335609436035156, "learning_rate": 1.5705004020673486e-05, "loss": 0.2935, "step": 165 }, { "epoch": 1.7553191489361701, "eval_loss": 1.4559073448181152, "eval_runtime": 41.376, "eval_samples_per_second": 7.251, "eval_steps_per_second": 0.242, "step": 165 }, { "epoch": 1.7872340425531914, "grad_norm": 38.07520294189453, "learning_rate": 1.5302311609886986e-05, "loss": 0.2622, "step": 168 }, { "epoch": 1.7872340425531914, "eval_loss": 1.4034123420715332, "eval_runtime": 4.1798, "eval_samples_per_second": 71.773, "eval_steps_per_second": 2.392, "step": 168 }, { "epoch": 1.8191489361702127, "grad_norm": 79.16899871826172, "learning_rate": 1.4899619199100489e-05, "loss": 0.4383, "step": 171 }, { "epoch": 1.8191489361702127, "eval_loss": 1.379632830619812, "eval_runtime": 41.4885, "eval_samples_per_second": 7.231, "eval_steps_per_second": 0.241, "step": 171 }, { "epoch": 1.851063829787234, "grad_norm": 56.25248718261719, "learning_rate": 1.4496926788313989e-05, "loss": 0.2953, "step": 174 }, { "epoch": 1.851063829787234, "eval_loss": 1.3718376159667969, "eval_runtime": 4.2567, "eval_samples_per_second": 70.478, "eval_steps_per_second": 2.349, "step": 174 }, { "epoch": 1.8829787234042552, "grad_norm": 45.190975189208984, "learning_rate": 1.4094234377527488e-05, "loss": 0.3427, "step": 177 }, { "epoch": 1.8829787234042552, "eval_loss": 1.3516465425491333, "eval_runtime": 41.6735, "eval_samples_per_second": 7.199, "eval_steps_per_second": 0.24, "step": 177 }, { "epoch": 1.9148936170212765, "grad_norm": 28.82175064086914, "learning_rate": 1.369154196674099e-05, "loss": 0.2204, "step": 180 }, { "epoch": 1.9148936170212765, "eval_loss": 1.3585424423217773, "eval_runtime": 4.1577, "eval_samples_per_second": 72.155, "eval_steps_per_second": 2.405, "step": 180 }, { "epoch": 1.9468085106382977, "grad_norm": 46.982139587402344, "learning_rate": 1.3288849555954491e-05, "loss": 0.2, "step": 183 }, { "epoch": 1.9468085106382977, "eval_loss": 1.3777296543121338, "eval_runtime": 41.8859, "eval_samples_per_second": 7.162, "eval_steps_per_second": 0.239, "step": 183 }, { "epoch": 1.978723404255319, "grad_norm": 22.6469669342041, "learning_rate": 1.288615714516799e-05, "loss": 0.2074, "step": 186 }, { "epoch": 1.978723404255319, "eval_loss": 1.3690061569213867, "eval_runtime": 4.6763, "eval_samples_per_second": 64.154, "eval_steps_per_second": 2.138, "step": 186 }, { "epoch": 2.0106382978723403, "grad_norm": 12.022841453552246, "learning_rate": 1.248346473438149e-05, "loss": 0.0972, "step": 189 }, { "epoch": 2.0106382978723403, "eval_loss": 1.354335069656372, "eval_runtime": 55.4961, "eval_samples_per_second": 5.406, "eval_steps_per_second": 0.18, "step": 189 }, { "epoch": 2.0425531914893615, "grad_norm": 32.32695007324219, "learning_rate": 1.2080772323594992e-05, "loss": 0.1505, "step": 192 }, { "epoch": 2.0425531914893615, "eval_loss": 1.3698673248291016, "eval_runtime": 34.6113, "eval_samples_per_second": 8.668, "eval_steps_per_second": 0.289, "step": 192 }, { "epoch": 2.074468085106383, "grad_norm": 20.132957458496094, "learning_rate": 1.1678079912808491e-05, "loss": 0.0651, "step": 195 }, { "epoch": 2.074468085106383, "eval_loss": 1.4110251665115356, "eval_runtime": 12.5667, "eval_samples_per_second": 23.873, "eval_steps_per_second": 0.796, "step": 195 }, { "epoch": 2.106382978723404, "grad_norm": 16.97626495361328, "learning_rate": 1.1275387502021991e-05, "loss": 0.045, "step": 198 }, { "epoch": 2.106382978723404, "eval_loss": 1.4494677782058716, "eval_runtime": 34.2066, "eval_samples_per_second": 8.77, "eval_steps_per_second": 0.292, "step": 198 }, { "epoch": 2.1382978723404253, "grad_norm": 27.48331642150879, "learning_rate": 1.0872695091235492e-05, "loss": 0.1105, "step": 201 }, { "epoch": 2.1382978723404253, "eval_loss": 1.4678298234939575, "eval_runtime": 12.6413, "eval_samples_per_second": 23.732, "eval_steps_per_second": 0.791, "step": 201 }, { "epoch": 2.1702127659574466, "grad_norm": 9.6218900680542, "learning_rate": 1.0470002680448992e-05, "loss": 0.0703, "step": 204 }, { "epoch": 2.1702127659574466, "eval_loss": 1.4693076610565186, "eval_runtime": 33.7057, "eval_samples_per_second": 8.901, "eval_steps_per_second": 0.297, "step": 204 }, { "epoch": 2.202127659574468, "grad_norm": 12.747748374938965, "learning_rate": 1.0067310269662492e-05, "loss": 0.0793, "step": 207 }, { "epoch": 2.202127659574468, "eval_loss": 1.4696707725524902, "eval_runtime": 11.7313, "eval_samples_per_second": 25.573, "eval_steps_per_second": 0.852, "step": 207 }, { "epoch": 2.2340425531914896, "grad_norm": 18.998815536499023, "learning_rate": 9.664617858875991e-06, "loss": 0.0709, "step": 210 }, { "epoch": 2.2340425531914896, "eval_loss": 1.4826494455337524, "eval_runtime": 33.8588, "eval_samples_per_second": 8.86, "eval_steps_per_second": 0.295, "step": 210 }, { "epoch": 2.2659574468085104, "grad_norm": 26.24040412902832, "learning_rate": 9.261925448089493e-06, "loss": 0.133, "step": 213 }, { "epoch": 2.2659574468085104, "eval_loss": 1.484723687171936, "eval_runtime": 11.9592, "eval_samples_per_second": 25.085, "eval_steps_per_second": 0.836, "step": 213 }, { "epoch": 2.297872340425532, "grad_norm": 38.74705123901367, "learning_rate": 8.859233037302994e-06, "loss": 0.1055, "step": 216 }, { "epoch": 2.297872340425532, "eval_loss": 1.4781628847122192, "eval_runtime": 34.0608, "eval_samples_per_second": 8.808, "eval_steps_per_second": 0.294, "step": 216 }, { "epoch": 2.329787234042553, "grad_norm": 51.491939544677734, "learning_rate": 8.456540626516494e-06, "loss": 0.106, "step": 219 }, { "epoch": 2.329787234042553, "eval_loss": 1.4520379304885864, "eval_runtime": 12.1072, "eval_samples_per_second": 24.779, "eval_steps_per_second": 0.826, "step": 219 }, { "epoch": 2.3617021276595747, "grad_norm": 15.932838439941406, "learning_rate": 8.053848215729993e-06, "loss": 0.0553, "step": 222 }, { "epoch": 2.3617021276595747, "eval_loss": 1.4323525428771973, "eval_runtime": 34.1547, "eval_samples_per_second": 8.784, "eval_steps_per_second": 0.293, "step": 222 }, { "epoch": 2.393617021276596, "grad_norm": 14.132286071777344, "learning_rate": 7.651155804943493e-06, "loss": 0.0598, "step": 225 }, { "epoch": 2.393617021276596, "eval_loss": 1.4186018705368042, "eval_runtime": 12.5329, "eval_samples_per_second": 23.937, "eval_steps_per_second": 0.798, "step": 225 }, { "epoch": 2.425531914893617, "grad_norm": 43.08507537841797, "learning_rate": 7.248463394156994e-06, "loss": 0.0903, "step": 228 }, { "epoch": 2.425531914893617, "eval_loss": 1.4079523086547852, "eval_runtime": 33.5371, "eval_samples_per_second": 8.945, "eval_steps_per_second": 0.298, "step": 228 }, { "epoch": 2.4574468085106385, "grad_norm": 18.148130416870117, "learning_rate": 6.845770983370495e-06, "loss": 0.0379, "step": 231 }, { "epoch": 2.4574468085106385, "eval_loss": 1.402553677558899, "eval_runtime": 12.5951, "eval_samples_per_second": 23.819, "eval_steps_per_second": 0.794, "step": 231 }, { "epoch": 2.4893617021276597, "grad_norm": 18.774492263793945, "learning_rate": 6.443078572583995e-06, "loss": 0.056, "step": 234 }, { "epoch": 2.4893617021276597, "eval_loss": 1.4105722904205322, "eval_runtime": 34.0705, "eval_samples_per_second": 8.805, "eval_steps_per_second": 0.294, "step": 234 }, { "epoch": 2.521276595744681, "grad_norm": 21.43934440612793, "learning_rate": 6.040386161797496e-06, "loss": 0.0753, "step": 237 }, { "epoch": 2.521276595744681, "eval_loss": 1.429024338722229, "eval_runtime": 11.7342, "eval_samples_per_second": 25.566, "eval_steps_per_second": 0.852, "step": 237 }, { "epoch": 2.5531914893617023, "grad_norm": 21.858001708984375, "learning_rate": 5.6376937510109955e-06, "loss": 0.0677, "step": 240 }, { "epoch": 2.5531914893617023, "eval_loss": 1.449249267578125, "eval_runtime": 34.3754, "eval_samples_per_second": 8.727, "eval_steps_per_second": 0.291, "step": 240 }, { "epoch": 2.5851063829787235, "grad_norm": 18.75592803955078, "learning_rate": 5.235001340224496e-06, "loss": 0.0636, "step": 243 }, { "epoch": 2.5851063829787235, "eval_loss": 1.4570808410644531, "eval_runtime": 11.6793, "eval_samples_per_second": 25.687, "eval_steps_per_second": 0.856, "step": 243 }, { "epoch": 2.617021276595745, "grad_norm": 23.944276809692383, "learning_rate": 4.832308929437996e-06, "loss": 0.0748, "step": 246 }, { "epoch": 2.617021276595745, "eval_loss": 1.4605278968811035, "eval_runtime": 33.9241, "eval_samples_per_second": 8.843, "eval_steps_per_second": 0.295, "step": 246 }, { "epoch": 2.648936170212766, "grad_norm": 9.529204368591309, "learning_rate": 4.429616518651497e-06, "loss": 0.0271, "step": 249 }, { "epoch": 2.648936170212766, "eval_loss": 1.4531209468841553, "eval_runtime": 11.8506, "eval_samples_per_second": 25.315, "eval_steps_per_second": 0.844, "step": 249 }, { "epoch": 2.6808510638297873, "grad_norm": 3.7583017349243164, "learning_rate": 4.026924107864997e-06, "loss": 0.0542, "step": 252 }, { "epoch": 2.6808510638297873, "eval_loss": 1.4430041313171387, "eval_runtime": 33.7906, "eval_samples_per_second": 8.878, "eval_steps_per_second": 0.296, "step": 252 }, { "epoch": 2.7127659574468086, "grad_norm": 12.694750785827637, "learning_rate": 3.624231697078497e-06, "loss": 0.0332, "step": 255 }, { "epoch": 2.7127659574468086, "eval_loss": 1.4361652135849, "eval_runtime": 11.6664, "eval_samples_per_second": 25.715, "eval_steps_per_second": 0.857, "step": 255 }, { "epoch": 2.74468085106383, "grad_norm": 36.67692184448242, "learning_rate": 3.2215392862919977e-06, "loss": 0.1035, "step": 258 }, { "epoch": 2.74468085106383, "eval_loss": 1.430528998374939, "eval_runtime": 33.9464, "eval_samples_per_second": 8.837, "eval_steps_per_second": 0.295, "step": 258 }, { "epoch": 2.776595744680851, "grad_norm": 30.15072250366211, "learning_rate": 2.8188468755054977e-06, "loss": 0.0643, "step": 261 }, { "epoch": 2.776595744680851, "eval_loss": 1.4278947114944458, "eval_runtime": 12.1733, "eval_samples_per_second": 24.644, "eval_steps_per_second": 0.821, "step": 261 }, { "epoch": 2.8085106382978724, "grad_norm": 5.219892978668213, "learning_rate": 2.416154464718998e-06, "loss": 0.0549, "step": 264 }, { "epoch": 2.8085106382978724, "eval_loss": 1.4259532690048218, "eval_runtime": 33.872, "eval_samples_per_second": 8.857, "eval_steps_per_second": 0.295, "step": 264 }, { "epoch": 2.8404255319148937, "grad_norm": 20.615497589111328, "learning_rate": 2.0134620539324983e-06, "loss": 0.0495, "step": 267 }, { "epoch": 2.8404255319148937, "eval_loss": 1.4268009662628174, "eval_runtime": 12.5283, "eval_samples_per_second": 23.946, "eval_steps_per_second": 0.798, "step": 267 }, { "epoch": 2.872340425531915, "grad_norm": 0.853824257850647, "learning_rate": 1.6107696431459988e-06, "loss": 0.0152, "step": 270 }, { "epoch": 2.872340425531915, "eval_loss": 1.427072525024414, "eval_runtime": 33.8909, "eval_samples_per_second": 8.852, "eval_steps_per_second": 0.295, "step": 270 }, { "epoch": 2.904255319148936, "grad_norm": 6.235328197479248, "learning_rate": 1.208077232359499e-06, "loss": 0.0806, "step": 273 }, { "epoch": 2.904255319148936, "eval_loss": 1.4273793697357178, "eval_runtime": 12.4911, "eval_samples_per_second": 24.017, "eval_steps_per_second": 0.801, "step": 273 }, { "epoch": 2.9361702127659575, "grad_norm": 17.21672248840332, "learning_rate": 8.053848215729994e-07, "loss": 0.0564, "step": 276 }, { "epoch": 2.9361702127659575, "eval_loss": 1.4273569583892822, "eval_runtime": 33.6977, "eval_samples_per_second": 8.903, "eval_steps_per_second": 0.297, "step": 276 }, { "epoch": 2.9680851063829787, "grad_norm": 23.0670166015625, "learning_rate": 4.026924107864997e-07, "loss": 0.0726, "step": 279 }, { "epoch": 2.9680851063829787, "eval_loss": 1.4270917177200317, "eval_runtime": 11.7754, "eval_samples_per_second": 25.477, "eval_steps_per_second": 0.849, "step": 279 }, { "epoch": 3.0, "grad_norm": 20.24632453918457, "learning_rate": 0.0, "loss": 0.0332, "step": 282 }, { "epoch": 3.0, "eval_loss": 1.427086591720581, "eval_runtime": 34.0405, "eval_samples_per_second": 8.813, "eval_steps_per_second": 0.294, "step": 282 } ], "logging_steps": 3, "max_steps": 282, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 523328480700102.0, "train_batch_size": 32, "trial_name": null, "trial_params": { "_wandb": {}, "assignments": {}, "decay": 0.1, "learning_rate": 3.785308661393097e-05, "metric": "eval/loss", "per_device_train_batch_size": 32 } }