|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.0, |
|
"eval_steps": 3, |
|
"global_step": 282, |
|
"is_hyper_param_search": true, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.031914893617021274, |
|
"grad_norm": 212.81973266601562, |
|
"learning_rate": 3.7450394203144474e-05, |
|
"loss": 2.2611, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.031914893617021274, |
|
"eval_loss": 1.6954327821731567, |
|
"eval_runtime": 90.1393, |
|
"eval_samples_per_second": 3.328, |
|
"eval_steps_per_second": 0.111, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.06382978723404255, |
|
"grad_norm": 94.31295776367188, |
|
"learning_rate": 3.704770179235797e-05, |
|
"loss": 2.1851, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.06382978723404255, |
|
"eval_loss": 1.83690345287323, |
|
"eval_runtime": 45.0365, |
|
"eval_samples_per_second": 6.661, |
|
"eval_steps_per_second": 0.222, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.09574468085106383, |
|
"grad_norm": 88.24703216552734, |
|
"learning_rate": 3.6645009381571474e-05, |
|
"loss": 1.9095, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.09574468085106383, |
|
"eval_loss": 1.8263834714889526, |
|
"eval_runtime": 44.4544, |
|
"eval_samples_per_second": 6.748, |
|
"eval_steps_per_second": 0.225, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.1276595744680851, |
|
"grad_norm": 73.16114044189453, |
|
"learning_rate": 3.6242316970784977e-05, |
|
"loss": 1.6022, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.1276595744680851, |
|
"eval_loss": 1.6566990613937378, |
|
"eval_runtime": 45.0842, |
|
"eval_samples_per_second": 6.654, |
|
"eval_steps_per_second": 0.222, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.1595744680851064, |
|
"grad_norm": 74.73725128173828, |
|
"learning_rate": 3.583962455999847e-05, |
|
"loss": 1.5181, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.1595744680851064, |
|
"eval_loss": 1.6280070543289185, |
|
"eval_runtime": 45.1381, |
|
"eval_samples_per_second": 6.646, |
|
"eval_steps_per_second": 0.222, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.19148936170212766, |
|
"grad_norm": 79.7146987915039, |
|
"learning_rate": 3.5436932149211976e-05, |
|
"loss": 1.8478, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.19148936170212766, |
|
"eval_loss": 1.5796161890029907, |
|
"eval_runtime": 45.9387, |
|
"eval_samples_per_second": 6.53, |
|
"eval_steps_per_second": 0.218, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.22340425531914893, |
|
"grad_norm": 65.3478775024414, |
|
"learning_rate": 3.503423973842547e-05, |
|
"loss": 1.3884, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.22340425531914893, |
|
"eval_loss": 1.5002487897872925, |
|
"eval_runtime": 45.7227, |
|
"eval_samples_per_second": 6.561, |
|
"eval_steps_per_second": 0.219, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.2553191489361702, |
|
"grad_norm": 66.32596588134766, |
|
"learning_rate": 3.4631547327638975e-05, |
|
"loss": 1.4368, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.2553191489361702, |
|
"eval_loss": 1.5191569328308105, |
|
"eval_runtime": 44.4497, |
|
"eval_samples_per_second": 6.749, |
|
"eval_steps_per_second": 0.225, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.2872340425531915, |
|
"grad_norm": 67.5133056640625, |
|
"learning_rate": 3.422885491685248e-05, |
|
"loss": 1.6515, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.2872340425531915, |
|
"eval_loss": 1.4516819715499878, |
|
"eval_runtime": 44.968, |
|
"eval_samples_per_second": 6.671, |
|
"eval_steps_per_second": 0.222, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.3191489361702128, |
|
"grad_norm": 56.178218841552734, |
|
"learning_rate": 3.3826162506065975e-05, |
|
"loss": 1.5518, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.3191489361702128, |
|
"eval_loss": 1.4015744924545288, |
|
"eval_runtime": 44.5216, |
|
"eval_samples_per_second": 6.738, |
|
"eval_steps_per_second": 0.225, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.35106382978723405, |
|
"grad_norm": 62.902313232421875, |
|
"learning_rate": 3.342347009527948e-05, |
|
"loss": 1.339, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.35106382978723405, |
|
"eval_loss": 1.3566683530807495, |
|
"eval_runtime": 44.9809, |
|
"eval_samples_per_second": 6.669, |
|
"eval_steps_per_second": 0.222, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.3829787234042553, |
|
"grad_norm": 57.751197814941406, |
|
"learning_rate": 3.3020777684492974e-05, |
|
"loss": 1.6338, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.3829787234042553, |
|
"eval_loss": 1.396578073501587, |
|
"eval_runtime": 45.3817, |
|
"eval_samples_per_second": 6.611, |
|
"eval_steps_per_second": 0.22, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.4148936170212766, |
|
"grad_norm": 57.7042121887207, |
|
"learning_rate": 3.261808527370648e-05, |
|
"loss": 1.4372, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.4148936170212766, |
|
"eval_loss": 1.3969274759292603, |
|
"eval_runtime": 63.0438, |
|
"eval_samples_per_second": 4.759, |
|
"eval_steps_per_second": 0.159, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.44680851063829785, |
|
"grad_norm": 43.087100982666016, |
|
"learning_rate": 3.221539286291997e-05, |
|
"loss": 1.2751, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.44680851063829785, |
|
"eval_loss": 1.3432629108428955, |
|
"eval_runtime": 43.0776, |
|
"eval_samples_per_second": 6.964, |
|
"eval_steps_per_second": 0.232, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.4787234042553192, |
|
"grad_norm": 58.544620513916016, |
|
"learning_rate": 3.1812700452133476e-05, |
|
"loss": 1.484, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.4787234042553192, |
|
"eval_loss": 1.389102578163147, |
|
"eval_runtime": 42.2187, |
|
"eval_samples_per_second": 7.106, |
|
"eval_steps_per_second": 0.237, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.5106382978723404, |
|
"grad_norm": 40.23255920410156, |
|
"learning_rate": 3.141000804134697e-05, |
|
"loss": 1.4936, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.5106382978723404, |
|
"eval_loss": 1.408040165901184, |
|
"eval_runtime": 42.8634, |
|
"eval_samples_per_second": 6.999, |
|
"eval_steps_per_second": 0.233, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.5425531914893617, |
|
"grad_norm": 58.45737075805664, |
|
"learning_rate": 3.1007315630560475e-05, |
|
"loss": 1.6771, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.5425531914893617, |
|
"eval_loss": 1.3876069784164429, |
|
"eval_runtime": 47.7351, |
|
"eval_samples_per_second": 6.285, |
|
"eval_steps_per_second": 0.209, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.574468085106383, |
|
"grad_norm": 59.58686447143555, |
|
"learning_rate": 3.060462321977397e-05, |
|
"loss": 1.7328, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.574468085106383, |
|
"eval_loss": 1.3280422687530518, |
|
"eval_runtime": 45.1191, |
|
"eval_samples_per_second": 6.649, |
|
"eval_steps_per_second": 0.222, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.6063829787234043, |
|
"grad_norm": 49.796451568603516, |
|
"learning_rate": 3.0201930808987475e-05, |
|
"loss": 1.3566, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.6063829787234043, |
|
"eval_loss": 1.3299312591552734, |
|
"eval_runtime": 45.4372, |
|
"eval_samples_per_second": 6.603, |
|
"eval_steps_per_second": 0.22, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.6382978723404256, |
|
"grad_norm": 47.63386154174805, |
|
"learning_rate": 2.9799238398200978e-05, |
|
"loss": 1.1698, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.6382978723404256, |
|
"eval_loss": 1.3619319200515747, |
|
"eval_runtime": 44.5898, |
|
"eval_samples_per_second": 6.728, |
|
"eval_steps_per_second": 0.224, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.6702127659574468, |
|
"grad_norm": 51.1750602722168, |
|
"learning_rate": 2.9396545987414478e-05, |
|
"loss": 1.4553, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.6702127659574468, |
|
"eval_loss": 1.2944597005844116, |
|
"eval_runtime": 44.0116, |
|
"eval_samples_per_second": 6.816, |
|
"eval_steps_per_second": 0.227, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.7021276595744681, |
|
"grad_norm": 56.35239028930664, |
|
"learning_rate": 2.8993853576627977e-05, |
|
"loss": 1.4532, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.7021276595744681, |
|
"eval_loss": 1.3396683931350708, |
|
"eval_runtime": 57.5395, |
|
"eval_samples_per_second": 5.214, |
|
"eval_steps_per_second": 0.174, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.7340425531914894, |
|
"grad_norm": 55.66822052001953, |
|
"learning_rate": 2.859116116584148e-05, |
|
"loss": 1.5623, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.7340425531914894, |
|
"eval_loss": 1.3019959926605225, |
|
"eval_runtime": 42.7505, |
|
"eval_samples_per_second": 7.017, |
|
"eval_steps_per_second": 0.234, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.7659574468085106, |
|
"grad_norm": 83.2627944946289, |
|
"learning_rate": 2.8188468755054976e-05, |
|
"loss": 1.5575, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.7659574468085106, |
|
"eval_loss": 1.2901865243911743, |
|
"eval_runtime": 42.3414, |
|
"eval_samples_per_second": 7.085, |
|
"eval_steps_per_second": 0.236, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.7978723404255319, |
|
"grad_norm": 66.81884002685547, |
|
"learning_rate": 2.778577634426848e-05, |
|
"loss": 1.4042, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.7978723404255319, |
|
"eval_loss": 1.2550606727600098, |
|
"eval_runtime": 42.5955, |
|
"eval_samples_per_second": 7.043, |
|
"eval_steps_per_second": 0.235, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.8297872340425532, |
|
"grad_norm": 47.388999938964844, |
|
"learning_rate": 2.738308393348198e-05, |
|
"loss": 1.3868, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.8297872340425532, |
|
"eval_loss": 1.2457486391067505, |
|
"eval_runtime": 42.5244, |
|
"eval_samples_per_second": 7.055, |
|
"eval_steps_per_second": 0.235, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.8617021276595744, |
|
"grad_norm": 51.87979507446289, |
|
"learning_rate": 2.698039152269548e-05, |
|
"loss": 1.3286, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.8617021276595744, |
|
"eval_loss": 1.2376227378845215, |
|
"eval_runtime": 42.3825, |
|
"eval_samples_per_second": 7.078, |
|
"eval_steps_per_second": 0.236, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.8936170212765957, |
|
"grad_norm": 61.56190490722656, |
|
"learning_rate": 2.6577699111908982e-05, |
|
"loss": 1.2435, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.8936170212765957, |
|
"eval_loss": 1.1932413578033447, |
|
"eval_runtime": 43.0229, |
|
"eval_samples_per_second": 6.973, |
|
"eval_steps_per_second": 0.232, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.925531914893617, |
|
"grad_norm": 56.688331604003906, |
|
"learning_rate": 2.617500670112248e-05, |
|
"loss": 1.1201, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.925531914893617, |
|
"eval_loss": 1.1663594245910645, |
|
"eval_runtime": 42.6963, |
|
"eval_samples_per_second": 7.026, |
|
"eval_steps_per_second": 0.234, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.9574468085106383, |
|
"grad_norm": 54.14950180053711, |
|
"learning_rate": 2.577231429033598e-05, |
|
"loss": 1.386, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.9574468085106383, |
|
"eval_loss": 1.1541115045547485, |
|
"eval_runtime": 42.4377, |
|
"eval_samples_per_second": 7.069, |
|
"eval_steps_per_second": 0.236, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.9893617021276596, |
|
"grad_norm": 63.0470085144043, |
|
"learning_rate": 2.536962187954948e-05, |
|
"loss": 1.0404, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.9893617021276596, |
|
"eval_loss": 1.1485306024551392, |
|
"eval_runtime": 42.3281, |
|
"eval_samples_per_second": 7.087, |
|
"eval_steps_per_second": 0.236, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 1.0212765957446808, |
|
"grad_norm": 29.81873321533203, |
|
"learning_rate": 2.496692946876298e-05, |
|
"loss": 0.7278, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 1.0212765957446808, |
|
"eval_loss": 1.1454874277114868, |
|
"eval_runtime": 4.1684, |
|
"eval_samples_per_second": 71.97, |
|
"eval_steps_per_second": 2.399, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 1.053191489361702, |
|
"grad_norm": 33.473297119140625, |
|
"learning_rate": 2.456423705797648e-05, |
|
"loss": 0.3543, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 1.053191489361702, |
|
"eval_loss": 1.177135944366455, |
|
"eval_runtime": 42.0082, |
|
"eval_samples_per_second": 7.141, |
|
"eval_steps_per_second": 0.238, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 1.0851063829787233, |
|
"grad_norm": 23.966768264770508, |
|
"learning_rate": 2.4161544647189983e-05, |
|
"loss": 0.2839, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 1.0851063829787233, |
|
"eval_loss": 1.2594685554504395, |
|
"eval_runtime": 4.1814, |
|
"eval_samples_per_second": 71.746, |
|
"eval_steps_per_second": 2.392, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 1.1170212765957448, |
|
"grad_norm": 23.973447799682617, |
|
"learning_rate": 2.375885223640348e-05, |
|
"loss": 0.2303, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 1.1170212765957448, |
|
"eval_loss": 1.3750962018966675, |
|
"eval_runtime": 42.0396, |
|
"eval_samples_per_second": 7.136, |
|
"eval_steps_per_second": 0.238, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 1.148936170212766, |
|
"grad_norm": 36.35573959350586, |
|
"learning_rate": 2.3356159825616983e-05, |
|
"loss": 0.3702, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 1.148936170212766, |
|
"eval_loss": 1.4052996635437012, |
|
"eval_runtime": 4.1732, |
|
"eval_samples_per_second": 71.887, |
|
"eval_steps_per_second": 2.396, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 1.1808510638297873, |
|
"grad_norm": 37.33869171142578, |
|
"learning_rate": 2.2953467414830486e-05, |
|
"loss": 0.37, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 1.1808510638297873, |
|
"eval_loss": 1.4478391408920288, |
|
"eval_runtime": 42.2951, |
|
"eval_samples_per_second": 7.093, |
|
"eval_steps_per_second": 0.236, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 1.2127659574468086, |
|
"grad_norm": 32.96900177001953, |
|
"learning_rate": 2.2550775004043982e-05, |
|
"loss": 0.2412, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 1.2127659574468086, |
|
"eval_loss": 1.4774221181869507, |
|
"eval_runtime": 5.0258, |
|
"eval_samples_per_second": 59.693, |
|
"eval_steps_per_second": 1.99, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 1.2446808510638299, |
|
"grad_norm": 45.11040496826172, |
|
"learning_rate": 2.2148082593257485e-05, |
|
"loss": 0.3028, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 1.2446808510638299, |
|
"eval_loss": 1.4591237306594849, |
|
"eval_runtime": 41.5496, |
|
"eval_samples_per_second": 7.22, |
|
"eval_steps_per_second": 0.241, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 1.2765957446808511, |
|
"grad_norm": 31.62494659423828, |
|
"learning_rate": 2.1745390182470985e-05, |
|
"loss": 0.322, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.2765957446808511, |
|
"eval_loss": 1.483196496963501, |
|
"eval_runtime": 4.901, |
|
"eval_samples_per_second": 61.212, |
|
"eval_steps_per_second": 2.04, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.3085106382978724, |
|
"grad_norm": 53.78076934814453, |
|
"learning_rate": 2.1342697771684484e-05, |
|
"loss": 0.4919, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 1.3085106382978724, |
|
"eval_loss": 1.4939560890197754, |
|
"eval_runtime": 41.6747, |
|
"eval_samples_per_second": 7.199, |
|
"eval_steps_per_second": 0.24, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 1.3404255319148937, |
|
"grad_norm": 54.71748352050781, |
|
"learning_rate": 2.0940005360897984e-05, |
|
"loss": 0.3868, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 1.3404255319148937, |
|
"eval_loss": 1.4461126327514648, |
|
"eval_runtime": 4.3166, |
|
"eval_samples_per_second": 69.498, |
|
"eval_steps_per_second": 2.317, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 1.372340425531915, |
|
"grad_norm": 39.04596710205078, |
|
"learning_rate": 2.0537312950111484e-05, |
|
"loss": 0.3795, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 1.372340425531915, |
|
"eval_loss": 1.421697735786438, |
|
"eval_runtime": 41.8322, |
|
"eval_samples_per_second": 7.172, |
|
"eval_steps_per_second": 0.239, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 1.4042553191489362, |
|
"grad_norm": 43.464900970458984, |
|
"learning_rate": 2.0134620539324983e-05, |
|
"loss": 0.2185, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 1.4042553191489362, |
|
"eval_loss": 1.4404044151306152, |
|
"eval_runtime": 4.1662, |
|
"eval_samples_per_second": 72.008, |
|
"eval_steps_per_second": 2.4, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 1.4361702127659575, |
|
"grad_norm": 39.858341217041016, |
|
"learning_rate": 1.9731928128538486e-05, |
|
"loss": 0.361, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 1.4361702127659575, |
|
"eval_loss": 1.445768117904663, |
|
"eval_runtime": 41.5719, |
|
"eval_samples_per_second": 7.216, |
|
"eval_steps_per_second": 0.241, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 1.4680851063829787, |
|
"grad_norm": 35.612701416015625, |
|
"learning_rate": 1.9329235717751983e-05, |
|
"loss": 0.3308, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 1.4680851063829787, |
|
"eval_loss": 1.447102665901184, |
|
"eval_runtime": 4.2095, |
|
"eval_samples_per_second": 71.267, |
|
"eval_steps_per_second": 2.376, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 40.08439636230469, |
|
"learning_rate": 1.8926543306965486e-05, |
|
"loss": 0.29, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"eval_loss": 1.4261672496795654, |
|
"eval_runtime": 41.3155, |
|
"eval_samples_per_second": 7.261, |
|
"eval_steps_per_second": 0.242, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 1.5319148936170213, |
|
"grad_norm": 34.78931427001953, |
|
"learning_rate": 1.8523850896178985e-05, |
|
"loss": 0.2372, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 1.5319148936170213, |
|
"eval_loss": 1.4144420623779297, |
|
"eval_runtime": 4.1971, |
|
"eval_samples_per_second": 71.478, |
|
"eval_steps_per_second": 2.383, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 1.5638297872340425, |
|
"grad_norm": 45.00236892700195, |
|
"learning_rate": 1.8121158485392488e-05, |
|
"loss": 0.2666, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 1.5638297872340425, |
|
"eval_loss": 1.4330610036849976, |
|
"eval_runtime": 41.5952, |
|
"eval_samples_per_second": 7.212, |
|
"eval_steps_per_second": 0.24, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 1.5957446808510638, |
|
"grad_norm": 53.813011169433594, |
|
"learning_rate": 1.7718466074605988e-05, |
|
"loss": 0.357, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.5957446808510638, |
|
"eval_loss": 1.49522864818573, |
|
"eval_runtime": 4.8438, |
|
"eval_samples_per_second": 61.935, |
|
"eval_steps_per_second": 2.065, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.627659574468085, |
|
"grad_norm": 50.32698440551758, |
|
"learning_rate": 1.7315773663819488e-05, |
|
"loss": 0.3668, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 1.627659574468085, |
|
"eval_loss": 1.5075035095214844, |
|
"eval_runtime": 42.1649, |
|
"eval_samples_per_second": 7.115, |
|
"eval_steps_per_second": 0.237, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 1.6595744680851063, |
|
"grad_norm": 29.047195434570312, |
|
"learning_rate": 1.6913081253032987e-05, |
|
"loss": 0.257, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 1.6595744680851063, |
|
"eval_loss": 1.499000072479248, |
|
"eval_runtime": 5.1976, |
|
"eval_samples_per_second": 57.719, |
|
"eval_steps_per_second": 1.924, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 1.6914893617021276, |
|
"grad_norm": 44.64079666137695, |
|
"learning_rate": 1.6510388842246487e-05, |
|
"loss": 0.2812, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 1.6914893617021276, |
|
"eval_loss": 1.5078258514404297, |
|
"eval_runtime": 41.7195, |
|
"eval_samples_per_second": 7.191, |
|
"eval_steps_per_second": 0.24, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 1.7234042553191489, |
|
"grad_norm": 45.4018669128418, |
|
"learning_rate": 1.6107696431459987e-05, |
|
"loss": 0.3697, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 1.7234042553191489, |
|
"eval_loss": 1.4968957901000977, |
|
"eval_runtime": 4.6701, |
|
"eval_samples_per_second": 64.239, |
|
"eval_steps_per_second": 2.141, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 1.7553191489361701, |
|
"grad_norm": 23.335609436035156, |
|
"learning_rate": 1.5705004020673486e-05, |
|
"loss": 0.2935, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 1.7553191489361701, |
|
"eval_loss": 1.4559073448181152, |
|
"eval_runtime": 41.376, |
|
"eval_samples_per_second": 7.251, |
|
"eval_steps_per_second": 0.242, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 1.7872340425531914, |
|
"grad_norm": 38.07520294189453, |
|
"learning_rate": 1.5302311609886986e-05, |
|
"loss": 0.2622, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 1.7872340425531914, |
|
"eval_loss": 1.4034123420715332, |
|
"eval_runtime": 4.1798, |
|
"eval_samples_per_second": 71.773, |
|
"eval_steps_per_second": 2.392, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 1.8191489361702127, |
|
"grad_norm": 79.16899871826172, |
|
"learning_rate": 1.4899619199100489e-05, |
|
"loss": 0.4383, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 1.8191489361702127, |
|
"eval_loss": 1.379632830619812, |
|
"eval_runtime": 41.4885, |
|
"eval_samples_per_second": 7.231, |
|
"eval_steps_per_second": 0.241, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 1.851063829787234, |
|
"grad_norm": 56.25248718261719, |
|
"learning_rate": 1.4496926788313989e-05, |
|
"loss": 0.2953, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 1.851063829787234, |
|
"eval_loss": 1.3718376159667969, |
|
"eval_runtime": 4.2567, |
|
"eval_samples_per_second": 70.478, |
|
"eval_steps_per_second": 2.349, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 1.8829787234042552, |
|
"grad_norm": 45.190975189208984, |
|
"learning_rate": 1.4094234377527488e-05, |
|
"loss": 0.3427, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 1.8829787234042552, |
|
"eval_loss": 1.3516465425491333, |
|
"eval_runtime": 41.6735, |
|
"eval_samples_per_second": 7.199, |
|
"eval_steps_per_second": 0.24, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 1.9148936170212765, |
|
"grad_norm": 28.82175064086914, |
|
"learning_rate": 1.369154196674099e-05, |
|
"loss": 0.2204, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.9148936170212765, |
|
"eval_loss": 1.3585424423217773, |
|
"eval_runtime": 4.1577, |
|
"eval_samples_per_second": 72.155, |
|
"eval_steps_per_second": 2.405, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.9468085106382977, |
|
"grad_norm": 46.982139587402344, |
|
"learning_rate": 1.3288849555954491e-05, |
|
"loss": 0.2, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 1.9468085106382977, |
|
"eval_loss": 1.3777296543121338, |
|
"eval_runtime": 41.8859, |
|
"eval_samples_per_second": 7.162, |
|
"eval_steps_per_second": 0.239, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 1.978723404255319, |
|
"grad_norm": 22.6469669342041, |
|
"learning_rate": 1.288615714516799e-05, |
|
"loss": 0.2074, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 1.978723404255319, |
|
"eval_loss": 1.3690061569213867, |
|
"eval_runtime": 4.6763, |
|
"eval_samples_per_second": 64.154, |
|
"eval_steps_per_second": 2.138, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 2.0106382978723403, |
|
"grad_norm": 12.022841453552246, |
|
"learning_rate": 1.248346473438149e-05, |
|
"loss": 0.0972, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 2.0106382978723403, |
|
"eval_loss": 1.354335069656372, |
|
"eval_runtime": 55.4961, |
|
"eval_samples_per_second": 5.406, |
|
"eval_steps_per_second": 0.18, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 2.0425531914893615, |
|
"grad_norm": 32.32695007324219, |
|
"learning_rate": 1.2080772323594992e-05, |
|
"loss": 0.1505, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 2.0425531914893615, |
|
"eval_loss": 1.3698673248291016, |
|
"eval_runtime": 34.6113, |
|
"eval_samples_per_second": 8.668, |
|
"eval_steps_per_second": 0.289, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 2.074468085106383, |
|
"grad_norm": 20.132957458496094, |
|
"learning_rate": 1.1678079912808491e-05, |
|
"loss": 0.0651, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 2.074468085106383, |
|
"eval_loss": 1.4110251665115356, |
|
"eval_runtime": 12.5667, |
|
"eval_samples_per_second": 23.873, |
|
"eval_steps_per_second": 0.796, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 2.106382978723404, |
|
"grad_norm": 16.97626495361328, |
|
"learning_rate": 1.1275387502021991e-05, |
|
"loss": 0.045, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 2.106382978723404, |
|
"eval_loss": 1.4494677782058716, |
|
"eval_runtime": 34.2066, |
|
"eval_samples_per_second": 8.77, |
|
"eval_steps_per_second": 0.292, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 2.1382978723404253, |
|
"grad_norm": 27.48331642150879, |
|
"learning_rate": 1.0872695091235492e-05, |
|
"loss": 0.1105, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 2.1382978723404253, |
|
"eval_loss": 1.4678298234939575, |
|
"eval_runtime": 12.6413, |
|
"eval_samples_per_second": 23.732, |
|
"eval_steps_per_second": 0.791, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 2.1702127659574466, |
|
"grad_norm": 9.6218900680542, |
|
"learning_rate": 1.0470002680448992e-05, |
|
"loss": 0.0703, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 2.1702127659574466, |
|
"eval_loss": 1.4693076610565186, |
|
"eval_runtime": 33.7057, |
|
"eval_samples_per_second": 8.901, |
|
"eval_steps_per_second": 0.297, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 2.202127659574468, |
|
"grad_norm": 12.747748374938965, |
|
"learning_rate": 1.0067310269662492e-05, |
|
"loss": 0.0793, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 2.202127659574468, |
|
"eval_loss": 1.4696707725524902, |
|
"eval_runtime": 11.7313, |
|
"eval_samples_per_second": 25.573, |
|
"eval_steps_per_second": 0.852, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 2.2340425531914896, |
|
"grad_norm": 18.998815536499023, |
|
"learning_rate": 9.664617858875991e-06, |
|
"loss": 0.0709, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 2.2340425531914896, |
|
"eval_loss": 1.4826494455337524, |
|
"eval_runtime": 33.8588, |
|
"eval_samples_per_second": 8.86, |
|
"eval_steps_per_second": 0.295, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 2.2659574468085104, |
|
"grad_norm": 26.24040412902832, |
|
"learning_rate": 9.261925448089493e-06, |
|
"loss": 0.133, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 2.2659574468085104, |
|
"eval_loss": 1.484723687171936, |
|
"eval_runtime": 11.9592, |
|
"eval_samples_per_second": 25.085, |
|
"eval_steps_per_second": 0.836, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 2.297872340425532, |
|
"grad_norm": 38.74705123901367, |
|
"learning_rate": 8.859233037302994e-06, |
|
"loss": 0.1055, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 2.297872340425532, |
|
"eval_loss": 1.4781628847122192, |
|
"eval_runtime": 34.0608, |
|
"eval_samples_per_second": 8.808, |
|
"eval_steps_per_second": 0.294, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 2.329787234042553, |
|
"grad_norm": 51.491939544677734, |
|
"learning_rate": 8.456540626516494e-06, |
|
"loss": 0.106, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 2.329787234042553, |
|
"eval_loss": 1.4520379304885864, |
|
"eval_runtime": 12.1072, |
|
"eval_samples_per_second": 24.779, |
|
"eval_steps_per_second": 0.826, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 2.3617021276595747, |
|
"grad_norm": 15.932838439941406, |
|
"learning_rate": 8.053848215729993e-06, |
|
"loss": 0.0553, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 2.3617021276595747, |
|
"eval_loss": 1.4323525428771973, |
|
"eval_runtime": 34.1547, |
|
"eval_samples_per_second": 8.784, |
|
"eval_steps_per_second": 0.293, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 2.393617021276596, |
|
"grad_norm": 14.132286071777344, |
|
"learning_rate": 7.651155804943493e-06, |
|
"loss": 0.0598, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 2.393617021276596, |
|
"eval_loss": 1.4186018705368042, |
|
"eval_runtime": 12.5329, |
|
"eval_samples_per_second": 23.937, |
|
"eval_steps_per_second": 0.798, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 2.425531914893617, |
|
"grad_norm": 43.08507537841797, |
|
"learning_rate": 7.248463394156994e-06, |
|
"loss": 0.0903, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 2.425531914893617, |
|
"eval_loss": 1.4079523086547852, |
|
"eval_runtime": 33.5371, |
|
"eval_samples_per_second": 8.945, |
|
"eval_steps_per_second": 0.298, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 2.4574468085106385, |
|
"grad_norm": 18.148130416870117, |
|
"learning_rate": 6.845770983370495e-06, |
|
"loss": 0.0379, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 2.4574468085106385, |
|
"eval_loss": 1.402553677558899, |
|
"eval_runtime": 12.5951, |
|
"eval_samples_per_second": 23.819, |
|
"eval_steps_per_second": 0.794, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 2.4893617021276597, |
|
"grad_norm": 18.774492263793945, |
|
"learning_rate": 6.443078572583995e-06, |
|
"loss": 0.056, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 2.4893617021276597, |
|
"eval_loss": 1.4105722904205322, |
|
"eval_runtime": 34.0705, |
|
"eval_samples_per_second": 8.805, |
|
"eval_steps_per_second": 0.294, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 2.521276595744681, |
|
"grad_norm": 21.43934440612793, |
|
"learning_rate": 6.040386161797496e-06, |
|
"loss": 0.0753, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 2.521276595744681, |
|
"eval_loss": 1.429024338722229, |
|
"eval_runtime": 11.7342, |
|
"eval_samples_per_second": 25.566, |
|
"eval_steps_per_second": 0.852, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 2.5531914893617023, |
|
"grad_norm": 21.858001708984375, |
|
"learning_rate": 5.6376937510109955e-06, |
|
"loss": 0.0677, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 2.5531914893617023, |
|
"eval_loss": 1.449249267578125, |
|
"eval_runtime": 34.3754, |
|
"eval_samples_per_second": 8.727, |
|
"eval_steps_per_second": 0.291, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 2.5851063829787235, |
|
"grad_norm": 18.75592803955078, |
|
"learning_rate": 5.235001340224496e-06, |
|
"loss": 0.0636, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 2.5851063829787235, |
|
"eval_loss": 1.4570808410644531, |
|
"eval_runtime": 11.6793, |
|
"eval_samples_per_second": 25.687, |
|
"eval_steps_per_second": 0.856, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 2.617021276595745, |
|
"grad_norm": 23.944276809692383, |
|
"learning_rate": 4.832308929437996e-06, |
|
"loss": 0.0748, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 2.617021276595745, |
|
"eval_loss": 1.4605278968811035, |
|
"eval_runtime": 33.9241, |
|
"eval_samples_per_second": 8.843, |
|
"eval_steps_per_second": 0.295, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 2.648936170212766, |
|
"grad_norm": 9.529204368591309, |
|
"learning_rate": 4.429616518651497e-06, |
|
"loss": 0.0271, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 2.648936170212766, |
|
"eval_loss": 1.4531209468841553, |
|
"eval_runtime": 11.8506, |
|
"eval_samples_per_second": 25.315, |
|
"eval_steps_per_second": 0.844, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 2.6808510638297873, |
|
"grad_norm": 3.7583017349243164, |
|
"learning_rate": 4.026924107864997e-06, |
|
"loss": 0.0542, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 2.6808510638297873, |
|
"eval_loss": 1.4430041313171387, |
|
"eval_runtime": 33.7906, |
|
"eval_samples_per_second": 8.878, |
|
"eval_steps_per_second": 0.296, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 2.7127659574468086, |
|
"grad_norm": 12.694750785827637, |
|
"learning_rate": 3.624231697078497e-06, |
|
"loss": 0.0332, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 2.7127659574468086, |
|
"eval_loss": 1.4361652135849, |
|
"eval_runtime": 11.6664, |
|
"eval_samples_per_second": 25.715, |
|
"eval_steps_per_second": 0.857, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 2.74468085106383, |
|
"grad_norm": 36.67692184448242, |
|
"learning_rate": 3.2215392862919977e-06, |
|
"loss": 0.1035, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 2.74468085106383, |
|
"eval_loss": 1.430528998374939, |
|
"eval_runtime": 33.9464, |
|
"eval_samples_per_second": 8.837, |
|
"eval_steps_per_second": 0.295, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 2.776595744680851, |
|
"grad_norm": 30.15072250366211, |
|
"learning_rate": 2.8188468755054977e-06, |
|
"loss": 0.0643, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 2.776595744680851, |
|
"eval_loss": 1.4278947114944458, |
|
"eval_runtime": 12.1733, |
|
"eval_samples_per_second": 24.644, |
|
"eval_steps_per_second": 0.821, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 2.8085106382978724, |
|
"grad_norm": 5.219892978668213, |
|
"learning_rate": 2.416154464718998e-06, |
|
"loss": 0.0549, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 2.8085106382978724, |
|
"eval_loss": 1.4259532690048218, |
|
"eval_runtime": 33.872, |
|
"eval_samples_per_second": 8.857, |
|
"eval_steps_per_second": 0.295, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 2.8404255319148937, |
|
"grad_norm": 20.615497589111328, |
|
"learning_rate": 2.0134620539324983e-06, |
|
"loss": 0.0495, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 2.8404255319148937, |
|
"eval_loss": 1.4268009662628174, |
|
"eval_runtime": 12.5283, |
|
"eval_samples_per_second": 23.946, |
|
"eval_steps_per_second": 0.798, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 2.872340425531915, |
|
"grad_norm": 0.853824257850647, |
|
"learning_rate": 1.6107696431459988e-06, |
|
"loss": 0.0152, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 2.872340425531915, |
|
"eval_loss": 1.427072525024414, |
|
"eval_runtime": 33.8909, |
|
"eval_samples_per_second": 8.852, |
|
"eval_steps_per_second": 0.295, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 2.904255319148936, |
|
"grad_norm": 6.235328197479248, |
|
"learning_rate": 1.208077232359499e-06, |
|
"loss": 0.0806, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 2.904255319148936, |
|
"eval_loss": 1.4273793697357178, |
|
"eval_runtime": 12.4911, |
|
"eval_samples_per_second": 24.017, |
|
"eval_steps_per_second": 0.801, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 2.9361702127659575, |
|
"grad_norm": 17.21672248840332, |
|
"learning_rate": 8.053848215729994e-07, |
|
"loss": 0.0564, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 2.9361702127659575, |
|
"eval_loss": 1.4273569583892822, |
|
"eval_runtime": 33.6977, |
|
"eval_samples_per_second": 8.903, |
|
"eval_steps_per_second": 0.297, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 2.9680851063829787, |
|
"grad_norm": 23.0670166015625, |
|
"learning_rate": 4.026924107864997e-07, |
|
"loss": 0.0726, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 2.9680851063829787, |
|
"eval_loss": 1.4270917177200317, |
|
"eval_runtime": 11.7754, |
|
"eval_samples_per_second": 25.477, |
|
"eval_steps_per_second": 0.849, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 20.24632453918457, |
|
"learning_rate": 0.0, |
|
"loss": 0.0332, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_loss": 1.427086591720581, |
|
"eval_runtime": 34.0405, |
|
"eval_samples_per_second": 8.813, |
|
"eval_steps_per_second": 0.294, |
|
"step": 282 |
|
} |
|
], |
|
"logging_steps": 3, |
|
"max_steps": 282, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 523328480700102.0, |
|
"train_batch_size": 32, |
|
"trial_name": null, |
|
"trial_params": { |
|
"_wandb": {}, |
|
"assignments": {}, |
|
"decay": 0.1, |
|
"learning_rate": 3.785308661393097e-05, |
|
"metric": "eval/loss", |
|
"per_device_train_batch_size": 32 |
|
} |
|
} |
|
|