|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.0, |
|
"eval_steps": 2, |
|
"global_step": 282, |
|
"is_hyper_param_search": true, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.02127659574468085, |
|
"grad_norm": 147.76431274414062, |
|
"learning_rate": 0.000495960190363068, |
|
"loss": 3.2762, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.02127659574468085, |
|
"eval_loss": 5.192628383636475, |
|
"eval_runtime": 254.7582, |
|
"eval_samples_per_second": 1.178, |
|
"eval_steps_per_second": 0.02, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.0425531914893617, |
|
"grad_norm": 34.33057403564453, |
|
"learning_rate": 0.0004924176175747603, |
|
"loss": 4.1983, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.0425531914893617, |
|
"eval_loss": 4.506951332092285, |
|
"eval_runtime": 116.2019, |
|
"eval_samples_per_second": 2.582, |
|
"eval_steps_per_second": 0.043, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.06382978723404255, |
|
"grad_norm": 2.5619542598724365, |
|
"learning_rate": 0.0004888750447864527, |
|
"loss": 3.6907, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.06382978723404255, |
|
"eval_loss": 4.125861644744873, |
|
"eval_runtime": 95.9708, |
|
"eval_samples_per_second": 3.126, |
|
"eval_steps_per_second": 0.052, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.0851063829787234, |
|
"grad_norm": 1.6348135471343994, |
|
"learning_rate": 0.0004853324719981451, |
|
"loss": 3.5109, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.0851063829787234, |
|
"eval_loss": 4.108506679534912, |
|
"eval_runtime": 95.4994, |
|
"eval_samples_per_second": 3.141, |
|
"eval_steps_per_second": 0.052, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.10638297872340426, |
|
"grad_norm": 0.02139226719737053, |
|
"learning_rate": 0.00048178989920983744, |
|
"loss": 3.4686, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.10638297872340426, |
|
"eval_loss": 4.104280471801758, |
|
"eval_runtime": 114.204, |
|
"eval_samples_per_second": 2.627, |
|
"eval_steps_per_second": 0.044, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.1276595744680851, |
|
"grad_norm": 0.005519088823348284, |
|
"learning_rate": 0.0004782473264215299, |
|
"loss": 3.466, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.1276595744680851, |
|
"eval_loss": 4.104023456573486, |
|
"eval_runtime": 96.7284, |
|
"eval_samples_per_second": 3.101, |
|
"eval_steps_per_second": 0.052, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.14893617021276595, |
|
"grad_norm": 0.005563751328736544, |
|
"learning_rate": 0.00047470475363322223, |
|
"loss": 3.4658, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.14893617021276595, |
|
"eval_loss": 4.104072093963623, |
|
"eval_runtime": 97.0883, |
|
"eval_samples_per_second": 3.09, |
|
"eval_steps_per_second": 0.051, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.1702127659574468, |
|
"grad_norm": 0.0055496166460216045, |
|
"learning_rate": 0.00047116218084491463, |
|
"loss": 3.4659, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.1702127659574468, |
|
"eval_loss": 4.104039669036865, |
|
"eval_runtime": 93.8461, |
|
"eval_samples_per_second": 3.197, |
|
"eval_steps_per_second": 0.053, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.19148936170212766, |
|
"grad_norm": 0.035093989223241806, |
|
"learning_rate": 0.000467619608056607, |
|
"loss": 3.4659, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.19148936170212766, |
|
"eval_loss": 4.104025363922119, |
|
"eval_runtime": 93.5115, |
|
"eval_samples_per_second": 3.208, |
|
"eval_steps_per_second": 0.053, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.2127659574468085, |
|
"grad_norm": 0.013235281221568584, |
|
"learning_rate": 0.0004640770352682993, |
|
"loss": 3.4658, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.2127659574468085, |
|
"eval_loss": 4.1039652824401855, |
|
"eval_runtime": 93.8067, |
|
"eval_samples_per_second": 3.198, |
|
"eval_steps_per_second": 0.053, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.23404255319148937, |
|
"grad_norm": 0.0016526976833119988, |
|
"learning_rate": 0.0004605344624799917, |
|
"loss": 3.4658, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.23404255319148937, |
|
"eval_loss": 4.103963375091553, |
|
"eval_runtime": 93.4791, |
|
"eval_samples_per_second": 3.209, |
|
"eval_steps_per_second": 0.053, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.2553191489361702, |
|
"grad_norm": 0.0017120653064921498, |
|
"learning_rate": 0.00045699188969168406, |
|
"loss": 3.4658, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.2553191489361702, |
|
"eval_loss": 4.103957176208496, |
|
"eval_runtime": 116.2864, |
|
"eval_samples_per_second": 2.58, |
|
"eval_steps_per_second": 0.043, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.2765957446808511, |
|
"grad_norm": 0.0011341843055561185, |
|
"learning_rate": 0.00045344931690337645, |
|
"loss": 3.4658, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.2765957446808511, |
|
"eval_loss": 4.103954315185547, |
|
"eval_runtime": 96.3574, |
|
"eval_samples_per_second": 3.113, |
|
"eval_steps_per_second": 0.052, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.2978723404255319, |
|
"grad_norm": 0.0025584695395082235, |
|
"learning_rate": 0.0004499067441150688, |
|
"loss": 3.4658, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.2978723404255319, |
|
"eval_loss": 4.103954792022705, |
|
"eval_runtime": 109.8402, |
|
"eval_samples_per_second": 2.731, |
|
"eval_steps_per_second": 0.046, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.3191489361702128, |
|
"grad_norm": 0.0012942380271852016, |
|
"learning_rate": 0.0004463641713267612, |
|
"loss": 3.4658, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.3191489361702128, |
|
"eval_loss": 4.1039557456970215, |
|
"eval_runtime": 114.0952, |
|
"eval_samples_per_second": 2.629, |
|
"eval_steps_per_second": 0.044, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.3404255319148936, |
|
"grad_norm": 0.001233804621733725, |
|
"learning_rate": 0.00044282159853845354, |
|
"loss": 3.4658, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.3404255319148936, |
|
"eval_loss": 4.103955268859863, |
|
"eval_runtime": 96.1239, |
|
"eval_samples_per_second": 3.121, |
|
"eval_steps_per_second": 0.052, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.3617021276595745, |
|
"grad_norm": 0.0013235628139227629, |
|
"learning_rate": 0.00043927902575014593, |
|
"loss": 3.4658, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.3617021276595745, |
|
"eval_loss": 4.103952407836914, |
|
"eval_runtime": 135.012, |
|
"eval_samples_per_second": 2.222, |
|
"eval_steps_per_second": 0.037, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.3829787234042553, |
|
"grad_norm": 0.002040724502876401, |
|
"learning_rate": 0.00043573645296183833, |
|
"loss": 3.4658, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.3829787234042553, |
|
"eval_loss": 4.103944301605225, |
|
"eval_runtime": 96.6912, |
|
"eval_samples_per_second": 3.103, |
|
"eval_steps_per_second": 0.052, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.40425531914893614, |
|
"grad_norm": 0.0005005718558095396, |
|
"learning_rate": 0.00043219388017353067, |
|
"loss": 3.4658, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.40425531914893614, |
|
"eval_loss": 4.103936195373535, |
|
"eval_runtime": 120.942, |
|
"eval_samples_per_second": 2.481, |
|
"eval_steps_per_second": 0.041, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.425531914893617, |
|
"grad_norm": 0.0004194887587800622, |
|
"learning_rate": 0.00042865130738522307, |
|
"loss": 3.4657, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.425531914893617, |
|
"eval_loss": 4.103933334350586, |
|
"eval_runtime": 165.7391, |
|
"eval_samples_per_second": 1.81, |
|
"eval_steps_per_second": 0.03, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.44680851063829785, |
|
"grad_norm": 0.0007025453960523009, |
|
"learning_rate": 0.0004251087345969154, |
|
"loss": 3.4657, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.44680851063829785, |
|
"eval_loss": 4.103935241699219, |
|
"eval_runtime": 95.9616, |
|
"eval_samples_per_second": 3.126, |
|
"eval_steps_per_second": 0.052, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.46808510638297873, |
|
"grad_norm": 0.0005715902079828084, |
|
"learning_rate": 0.0004215661618086078, |
|
"loss": 3.4657, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.46808510638297873, |
|
"eval_loss": 4.103936195373535, |
|
"eval_runtime": 95.6382, |
|
"eval_samples_per_second": 3.137, |
|
"eval_steps_per_second": 0.052, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.48936170212765956, |
|
"grad_norm": 0.0009604791412129998, |
|
"learning_rate": 0.00041802358902030015, |
|
"loss": 3.4657, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.48936170212765956, |
|
"eval_loss": 4.103936195373535, |
|
"eval_runtime": 95.422, |
|
"eval_samples_per_second": 3.144, |
|
"eval_steps_per_second": 0.052, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.5106382978723404, |
|
"grad_norm": 0.0015922917518764734, |
|
"learning_rate": 0.00041448101623199255, |
|
"loss": 3.4657, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.5106382978723404, |
|
"eval_loss": 4.103933334350586, |
|
"eval_runtime": 95.2253, |
|
"eval_samples_per_second": 3.15, |
|
"eval_steps_per_second": 0.053, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.5319148936170213, |
|
"grad_norm": 0.00046437734272331, |
|
"learning_rate": 0.0004109384434436849, |
|
"loss": 3.4657, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.5319148936170213, |
|
"eval_loss": 4.103931427001953, |
|
"eval_runtime": 93.7871, |
|
"eval_samples_per_second": 3.199, |
|
"eval_steps_per_second": 0.053, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.5531914893617021, |
|
"grad_norm": 0.0003185276291333139, |
|
"learning_rate": 0.00040739587065537723, |
|
"loss": 3.4657, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.5531914893617021, |
|
"eval_loss": 4.103931903839111, |
|
"eval_runtime": 94.4747, |
|
"eval_samples_per_second": 3.175, |
|
"eval_steps_per_second": 0.053, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.574468085106383, |
|
"grad_norm": 0.0005938044050708413, |
|
"learning_rate": 0.00040385329786706963, |
|
"loss": 3.4657, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.574468085106383, |
|
"eval_loss": 4.103933334350586, |
|
"eval_runtime": 94.4693, |
|
"eval_samples_per_second": 3.176, |
|
"eval_steps_per_second": 0.053, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.5957446808510638, |
|
"grad_norm": 0.0008093913784250617, |
|
"learning_rate": 0.000400310725078762, |
|
"loss": 3.4657, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.5957446808510638, |
|
"eval_loss": 4.103933334350586, |
|
"eval_runtime": 93.6969, |
|
"eval_samples_per_second": 3.202, |
|
"eval_steps_per_second": 0.053, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.6170212765957447, |
|
"grad_norm": 0.000580158899538219, |
|
"learning_rate": 0.0003967681522904544, |
|
"loss": 3.4657, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.6170212765957447, |
|
"eval_loss": 4.103932857513428, |
|
"eval_runtime": 94.0277, |
|
"eval_samples_per_second": 3.191, |
|
"eval_steps_per_second": 0.053, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.6382978723404256, |
|
"grad_norm": 0.0005565917235799134, |
|
"learning_rate": 0.00039322557950214677, |
|
"loss": 3.4657, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.6382978723404256, |
|
"eval_loss": 4.103931903839111, |
|
"eval_runtime": 107.4438, |
|
"eval_samples_per_second": 2.792, |
|
"eval_steps_per_second": 0.047, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.6595744680851063, |
|
"grad_norm": 0.0006027113413438201, |
|
"learning_rate": 0.00038968300671383916, |
|
"loss": 3.4657, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.6595744680851063, |
|
"eval_loss": 4.103930473327637, |
|
"eval_runtime": 107.2976, |
|
"eval_samples_per_second": 2.796, |
|
"eval_steps_per_second": 0.047, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.6808510638297872, |
|
"grad_norm": 0.0002680857141967863, |
|
"learning_rate": 0.0003861404339255315, |
|
"loss": 3.4657, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.6808510638297872, |
|
"eval_loss": 4.103930473327637, |
|
"eval_runtime": 94.4753, |
|
"eval_samples_per_second": 3.175, |
|
"eval_steps_per_second": 0.053, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.7021276595744681, |
|
"grad_norm": 0.0003917052235919982, |
|
"learning_rate": 0.0003825978611372239, |
|
"loss": 3.4657, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.7021276595744681, |
|
"eval_loss": 4.103930473327637, |
|
"eval_runtime": 94.4775, |
|
"eval_samples_per_second": 3.175, |
|
"eval_steps_per_second": 0.053, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.723404255319149, |
|
"grad_norm": 0.00032164924778044224, |
|
"learning_rate": 0.00037905528834891625, |
|
"loss": 3.4657, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.723404255319149, |
|
"eval_loss": 4.103930473327637, |
|
"eval_runtime": 94.9268, |
|
"eval_samples_per_second": 3.16, |
|
"eval_steps_per_second": 0.053, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.7446808510638298, |
|
"grad_norm": 0.0004127752035856247, |
|
"learning_rate": 0.0003755127155606086, |
|
"loss": 3.4657, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.7446808510638298, |
|
"eval_loss": 4.103930473327637, |
|
"eval_runtime": 95.0978, |
|
"eval_samples_per_second": 3.155, |
|
"eval_steps_per_second": 0.053, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.7659574468085106, |
|
"grad_norm": 0.00039929302874952555, |
|
"learning_rate": 0.000371970142772301, |
|
"loss": 3.4657, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.7659574468085106, |
|
"eval_loss": 4.1039299964904785, |
|
"eval_runtime": 100.3564, |
|
"eval_samples_per_second": 2.989, |
|
"eval_steps_per_second": 0.05, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.7872340425531915, |
|
"grad_norm": 0.00019805252668447793, |
|
"learning_rate": 0.00036842756998399333, |
|
"loss": 3.4657, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.7872340425531915, |
|
"eval_loss": 4.1039299964904785, |
|
"eval_runtime": 93.9211, |
|
"eval_samples_per_second": 3.194, |
|
"eval_steps_per_second": 0.053, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.8085106382978723, |
|
"grad_norm": 0.00028355230460874736, |
|
"learning_rate": 0.0003648849971956858, |
|
"loss": 3.4657, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.8085106382978723, |
|
"eval_loss": 4.1039299964904785, |
|
"eval_runtime": 122.794, |
|
"eval_samples_per_second": 2.443, |
|
"eval_steps_per_second": 0.041, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.8297872340425532, |
|
"grad_norm": 0.0003330525360070169, |
|
"learning_rate": 0.0003613424244073781, |
|
"loss": 3.4657, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.8297872340425532, |
|
"eval_loss": 4.1039299964904785, |
|
"eval_runtime": 108.2578, |
|
"eval_samples_per_second": 2.771, |
|
"eval_steps_per_second": 0.046, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.851063829787234, |
|
"grad_norm": 0.00028144015232101083, |
|
"learning_rate": 0.0003577998516190705, |
|
"loss": 3.4657, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.851063829787234, |
|
"eval_loss": 4.10392951965332, |
|
"eval_runtime": 93.7754, |
|
"eval_samples_per_second": 3.199, |
|
"eval_steps_per_second": 0.053, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.8723404255319149, |
|
"grad_norm": 0.0003809813060797751, |
|
"learning_rate": 0.00035425727883076286, |
|
"loss": 3.4657, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.8723404255319149, |
|
"eval_loss": 4.1039299964904785, |
|
"eval_runtime": 104.9248, |
|
"eval_samples_per_second": 2.859, |
|
"eval_steps_per_second": 0.048, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.8936170212765957, |
|
"grad_norm": 0.0005144781316630542, |
|
"learning_rate": 0.0003507147060424552, |
|
"loss": 3.4657, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.8936170212765957, |
|
"eval_loss": 4.1039299964904785, |
|
"eval_runtime": 94.2337, |
|
"eval_samples_per_second": 3.184, |
|
"eval_steps_per_second": 0.053, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.9148936170212766, |
|
"grad_norm": 0.0004984396509826183, |
|
"learning_rate": 0.0003471721332541476, |
|
"loss": 3.4657, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.9148936170212766, |
|
"eval_loss": 4.103929042816162, |
|
"eval_runtime": 93.8643, |
|
"eval_samples_per_second": 3.196, |
|
"eval_steps_per_second": 0.053, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.9361702127659575, |
|
"grad_norm": 0.00022737662948202342, |
|
"learning_rate": 0.00034362956046583994, |
|
"loss": 3.4657, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.9361702127659575, |
|
"eval_loss": 4.1039299964904785, |
|
"eval_runtime": 135.1229, |
|
"eval_samples_per_second": 2.22, |
|
"eval_steps_per_second": 0.037, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.9574468085106383, |
|
"grad_norm": 0.0003345514414831996, |
|
"learning_rate": 0.00034008698767753234, |
|
"loss": 3.4657, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.9574468085106383, |
|
"eval_loss": 4.1039299964904785, |
|
"eval_runtime": 141.4817, |
|
"eval_samples_per_second": 2.12, |
|
"eval_steps_per_second": 0.035, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.9787234042553191, |
|
"grad_norm": 0.00027788232546299696, |
|
"learning_rate": 0.0003365444148892247, |
|
"loss": 3.4657, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.9787234042553191, |
|
"eval_loss": 4.10392951965332, |
|
"eval_runtime": 116.6901, |
|
"eval_samples_per_second": 2.571, |
|
"eval_steps_per_second": 0.043, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.00033105359761975706, |
|
"learning_rate": 0.0003330018421009171, |
|
"loss": 3.3006, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 4.103929042816162, |
|
"eval_runtime": 149.9164, |
|
"eval_samples_per_second": 2.001, |
|
"eval_steps_per_second": 0.033, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 1.0212765957446808, |
|
"grad_norm": 0.0003272149770054966, |
|
"learning_rate": 0.0003294592693126094, |
|
"loss": 3.4657, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 1.0212765957446808, |
|
"eval_loss": 4.103928565979004, |
|
"eval_runtime": 117.8922, |
|
"eval_samples_per_second": 2.545, |
|
"eval_steps_per_second": 0.042, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 1.0425531914893618, |
|
"grad_norm": 0.0002339623897569254, |
|
"learning_rate": 0.0003259166965243019, |
|
"loss": 3.4657, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 1.0425531914893618, |
|
"eval_loss": 4.10392951965332, |
|
"eval_runtime": 94.5849, |
|
"eval_samples_per_second": 3.172, |
|
"eval_steps_per_second": 0.053, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 1.0638297872340425, |
|
"grad_norm": 0.0003783780266530812, |
|
"learning_rate": 0.0003223741237359942, |
|
"loss": 3.4657, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.0638297872340425, |
|
"eval_loss": 4.103929042816162, |
|
"eval_runtime": 117.3215, |
|
"eval_samples_per_second": 2.557, |
|
"eval_steps_per_second": 0.043, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.0851063829787233, |
|
"grad_norm": 0.00021044367167633027, |
|
"learning_rate": 0.00031883155094768656, |
|
"loss": 3.4657, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 1.0851063829787233, |
|
"eval_loss": 4.103929042816162, |
|
"eval_runtime": 118.2354, |
|
"eval_samples_per_second": 2.537, |
|
"eval_steps_per_second": 0.042, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 1.1063829787234043, |
|
"grad_norm": 0.00032288962393067777, |
|
"learning_rate": 0.00031528897815937896, |
|
"loss": 3.4657, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 1.1063829787234043, |
|
"eval_loss": 4.103928089141846, |
|
"eval_runtime": 111.5984, |
|
"eval_samples_per_second": 2.688, |
|
"eval_steps_per_second": 0.045, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 1.127659574468085, |
|
"grad_norm": 0.0002670014800969511, |
|
"learning_rate": 0.0003117464053710713, |
|
"loss": 3.4657, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 1.127659574468085, |
|
"eval_loss": 4.103928565979004, |
|
"eval_runtime": 121.2363, |
|
"eval_samples_per_second": 2.475, |
|
"eval_steps_per_second": 0.041, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 1.148936170212766, |
|
"grad_norm": 0.00021224924421403557, |
|
"learning_rate": 0.0003082038325827637, |
|
"loss": 3.4657, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 1.148936170212766, |
|
"eval_loss": 4.103928089141846, |
|
"eval_runtime": 102.1198, |
|
"eval_samples_per_second": 2.938, |
|
"eval_steps_per_second": 0.049, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 1.1702127659574468, |
|
"grad_norm": 0.0002756421163212508, |
|
"learning_rate": 0.00030466125979445604, |
|
"loss": 3.4657, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 1.1702127659574468, |
|
"eval_loss": 4.103928565979004, |
|
"eval_runtime": 121.4773, |
|
"eval_samples_per_second": 2.47, |
|
"eval_steps_per_second": 0.041, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 1.1914893617021276, |
|
"grad_norm": 0.0002671127731446177, |
|
"learning_rate": 0.00030111868700614843, |
|
"loss": 3.4657, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 1.1914893617021276, |
|
"eval_loss": 4.103928565979004, |
|
"eval_runtime": 138.1466, |
|
"eval_samples_per_second": 2.172, |
|
"eval_steps_per_second": 0.036, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 1.2127659574468086, |
|
"grad_norm": 0.0004021762579213828, |
|
"learning_rate": 0.0002975761142178408, |
|
"loss": 3.4657, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 1.2127659574468086, |
|
"eval_loss": 4.103928089141846, |
|
"eval_runtime": 101.5548, |
|
"eval_samples_per_second": 2.954, |
|
"eval_steps_per_second": 0.049, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 1.2340425531914894, |
|
"grad_norm": 0.0002796368789859116, |
|
"learning_rate": 0.0002940335414295331, |
|
"loss": 3.4657, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 1.2340425531914894, |
|
"eval_loss": 4.103928089141846, |
|
"eval_runtime": 94.7269, |
|
"eval_samples_per_second": 3.167, |
|
"eval_steps_per_second": 0.053, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 1.2553191489361701, |
|
"grad_norm": 0.0003006465267390013, |
|
"learning_rate": 0.0002904909686412255, |
|
"loss": 3.4657, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 1.2553191489361701, |
|
"eval_loss": 4.103928089141846, |
|
"eval_runtime": 104.4724, |
|
"eval_samples_per_second": 2.872, |
|
"eval_steps_per_second": 0.048, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 1.2765957446808511, |
|
"grad_norm": 0.00029228252242319286, |
|
"learning_rate": 0.0002869483958529179, |
|
"loss": 3.4657, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.2765957446808511, |
|
"eval_loss": 4.103928565979004, |
|
"eval_runtime": 108.4499, |
|
"eval_samples_per_second": 2.766, |
|
"eval_steps_per_second": 0.046, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.297872340425532, |
|
"grad_norm": 0.00032636514515616, |
|
"learning_rate": 0.0002834058230646103, |
|
"loss": 3.4657, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 1.297872340425532, |
|
"eval_loss": 4.1039276123046875, |
|
"eval_runtime": 94.7255, |
|
"eval_samples_per_second": 3.167, |
|
"eval_steps_per_second": 0.053, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 1.3191489361702127, |
|
"grad_norm": 0.00040858419379219413, |
|
"learning_rate": 0.00027986325027630265, |
|
"loss": 3.4657, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 1.3191489361702127, |
|
"eval_loss": 4.103928089141846, |
|
"eval_runtime": 123.8424, |
|
"eval_samples_per_second": 2.422, |
|
"eval_steps_per_second": 0.04, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 1.3404255319148937, |
|
"grad_norm": 0.00040357012767344713, |
|
"learning_rate": 0.00027632067748799505, |
|
"loss": 3.4657, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 1.3404255319148937, |
|
"eval_loss": 4.103928089141846, |
|
"eval_runtime": 118.9281, |
|
"eval_samples_per_second": 2.523, |
|
"eval_steps_per_second": 0.042, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 1.3617021276595744, |
|
"grad_norm": 0.00039930595085024834, |
|
"learning_rate": 0.0002727781046996874, |
|
"loss": 3.4657, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 1.3617021276595744, |
|
"eval_loss": 4.1039276123046875, |
|
"eval_runtime": 112.1697, |
|
"eval_samples_per_second": 2.675, |
|
"eval_steps_per_second": 0.045, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 1.3829787234042552, |
|
"grad_norm": 0.00037575900205411017, |
|
"learning_rate": 0.0002692355319113798, |
|
"loss": 3.4657, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 1.3829787234042552, |
|
"eval_loss": 4.1039276123046875, |
|
"eval_runtime": 125.21, |
|
"eval_samples_per_second": 2.396, |
|
"eval_steps_per_second": 0.04, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 1.4042553191489362, |
|
"grad_norm": 0.00047126636491157115, |
|
"learning_rate": 0.00026569295912307213, |
|
"loss": 3.4657, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 1.4042553191489362, |
|
"eval_loss": 4.1039276123046875, |
|
"eval_runtime": 135.9147, |
|
"eval_samples_per_second": 2.207, |
|
"eval_steps_per_second": 0.037, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 1.425531914893617, |
|
"grad_norm": 0.0005009864689782262, |
|
"learning_rate": 0.0002621503863347645, |
|
"loss": 3.4657, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 1.425531914893617, |
|
"eval_loss": 4.1039276123046875, |
|
"eval_runtime": 107.4624, |
|
"eval_samples_per_second": 2.792, |
|
"eval_steps_per_second": 0.047, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 1.4468085106382977, |
|
"grad_norm": 0.0004678282712120563, |
|
"learning_rate": 0.00025860781354645687, |
|
"loss": 3.4657, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 1.4468085106382977, |
|
"eval_loss": 4.1039276123046875, |
|
"eval_runtime": 138.7278, |
|
"eval_samples_per_second": 2.163, |
|
"eval_steps_per_second": 0.036, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 1.4680851063829787, |
|
"grad_norm": 0.0003415594110265374, |
|
"learning_rate": 0.0002550652407581492, |
|
"loss": 3.4657, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 1.4680851063829787, |
|
"eval_loss": 4.1039276123046875, |
|
"eval_runtime": 130.873, |
|
"eval_samples_per_second": 2.292, |
|
"eval_steps_per_second": 0.038, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 1.4893617021276595, |
|
"grad_norm": 0.00039781673694960773, |
|
"learning_rate": 0.0002515226679698416, |
|
"loss": 3.4657, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.4893617021276595, |
|
"eval_loss": 4.1039276123046875, |
|
"eval_runtime": 127.3509, |
|
"eval_samples_per_second": 2.356, |
|
"eval_steps_per_second": 0.039, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.5106382978723403, |
|
"grad_norm": 0.0007758048013783991, |
|
"learning_rate": 0.000247980095181534, |
|
"loss": 3.4657, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 1.5106382978723403, |
|
"eval_loss": 4.103927135467529, |
|
"eval_runtime": 93.1203, |
|
"eval_samples_per_second": 3.222, |
|
"eval_steps_per_second": 0.054, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 1.5319148936170213, |
|
"grad_norm": 0.0003962689661420882, |
|
"learning_rate": 0.00024443752239322635, |
|
"loss": 3.4657, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 1.5319148936170213, |
|
"eval_loss": 4.103927135467529, |
|
"eval_runtime": 92.9702, |
|
"eval_samples_per_second": 3.227, |
|
"eval_steps_per_second": 0.054, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 1.5531914893617023, |
|
"grad_norm": 0.00046265136916190386, |
|
"learning_rate": 0.00024089494960491872, |
|
"loss": 3.4657, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 1.5531914893617023, |
|
"eval_loss": 4.103926658630371, |
|
"eval_runtime": 95.1623, |
|
"eval_samples_per_second": 3.153, |
|
"eval_steps_per_second": 0.053, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 1.574468085106383, |
|
"grad_norm": 0.0003199471684638411, |
|
"learning_rate": 0.00023735237681661112, |
|
"loss": 3.4657, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 1.574468085106383, |
|
"eval_loss": 4.103927135467529, |
|
"eval_runtime": 94.8304, |
|
"eval_samples_per_second": 3.164, |
|
"eval_steps_per_second": 0.053, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 1.5957446808510638, |
|
"grad_norm": 0.0009507798822596669, |
|
"learning_rate": 0.0002338098040283035, |
|
"loss": 3.4657, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.5957446808510638, |
|
"eval_loss": 4.103926658630371, |
|
"eval_runtime": 106.393, |
|
"eval_samples_per_second": 2.82, |
|
"eval_steps_per_second": 0.047, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.6170212765957448, |
|
"grad_norm": 0.0003600665950216353, |
|
"learning_rate": 0.00023026723123999586, |
|
"loss": 3.4657, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 1.6170212765957448, |
|
"eval_loss": 4.103927135467529, |
|
"eval_runtime": 93.6609, |
|
"eval_samples_per_second": 3.203, |
|
"eval_steps_per_second": 0.053, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 1.6382978723404256, |
|
"grad_norm": 0.00046105613000690937, |
|
"learning_rate": 0.00022672465845168823, |
|
"loss": 3.4657, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 1.6382978723404256, |
|
"eval_loss": 4.103926658630371, |
|
"eval_runtime": 93.7233, |
|
"eval_samples_per_second": 3.201, |
|
"eval_steps_per_second": 0.053, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 1.6595744680851063, |
|
"grad_norm": 0.0004165441496297717, |
|
"learning_rate": 0.0002231820856633806, |
|
"loss": 3.4657, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 1.6595744680851063, |
|
"eval_loss": 4.103926658630371, |
|
"eval_runtime": 93.1558, |
|
"eval_samples_per_second": 3.22, |
|
"eval_steps_per_second": 0.054, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 1.6808510638297873, |
|
"grad_norm": 0.00046332846977747977, |
|
"learning_rate": 0.00021963951287507297, |
|
"loss": 3.4657, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 1.6808510638297873, |
|
"eval_loss": 4.103925704956055, |
|
"eval_runtime": 92.8599, |
|
"eval_samples_per_second": 3.231, |
|
"eval_steps_per_second": 0.054, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 1.702127659574468, |
|
"grad_norm": 0.00045260830665938556, |
|
"learning_rate": 0.00021609694008676534, |
|
"loss": 3.4657, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.702127659574468, |
|
"eval_loss": 4.103925704956055, |
|
"eval_runtime": 92.4335, |
|
"eval_samples_per_second": 3.246, |
|
"eval_steps_per_second": 0.054, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.7234042553191489, |
|
"grad_norm": 0.0006771318148821592, |
|
"learning_rate": 0.0002125543672984577, |
|
"loss": 3.4657, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 1.7234042553191489, |
|
"eval_loss": 4.1039252281188965, |
|
"eval_runtime": 92.8141, |
|
"eval_samples_per_second": 3.232, |
|
"eval_steps_per_second": 0.054, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 1.7446808510638299, |
|
"grad_norm": 0.0002604821929708123, |
|
"learning_rate": 0.00020901179451015008, |
|
"loss": 3.4657, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 1.7446808510638299, |
|
"eval_loss": 4.1039252281188965, |
|
"eval_runtime": 93.7459, |
|
"eval_samples_per_second": 3.2, |
|
"eval_steps_per_second": 0.053, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 1.7659574468085106, |
|
"grad_norm": 0.0006150390254333615, |
|
"learning_rate": 0.00020546922172184245, |
|
"loss": 3.4657, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 1.7659574468085106, |
|
"eval_loss": 4.103924751281738, |
|
"eval_runtime": 94.135, |
|
"eval_samples_per_second": 3.187, |
|
"eval_steps_per_second": 0.053, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 1.7872340425531914, |
|
"grad_norm": 0.0006115163560025394, |
|
"learning_rate": 0.00020192664893353482, |
|
"loss": 3.4657, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 1.7872340425531914, |
|
"eval_loss": 4.10392427444458, |
|
"eval_runtime": 93.8446, |
|
"eval_samples_per_second": 3.197, |
|
"eval_steps_per_second": 0.053, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 1.8085106382978724, |
|
"grad_norm": 0.000593107077293098, |
|
"learning_rate": 0.0001983840761452272, |
|
"loss": 3.4657, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.8085106382978724, |
|
"eval_loss": 4.10392427444458, |
|
"eval_runtime": 92.9188, |
|
"eval_samples_per_second": 3.229, |
|
"eval_steps_per_second": 0.054, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.8297872340425532, |
|
"grad_norm": 0.0008810166036710143, |
|
"learning_rate": 0.00019484150335691958, |
|
"loss": 3.4657, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 1.8297872340425532, |
|
"eval_loss": 4.103923320770264, |
|
"eval_runtime": 93.7813, |
|
"eval_samples_per_second": 3.199, |
|
"eval_steps_per_second": 0.053, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 1.851063829787234, |
|
"grad_norm": 0.0006031219381839037, |
|
"learning_rate": 0.00019129893056861195, |
|
"loss": 3.4657, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 1.851063829787234, |
|
"eval_loss": 4.103923320770264, |
|
"eval_runtime": 94.6061, |
|
"eval_samples_per_second": 3.171, |
|
"eval_steps_per_second": 0.053, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 1.872340425531915, |
|
"grad_norm": 0.000731271633412689, |
|
"learning_rate": 0.0001877563577803043, |
|
"loss": 3.4657, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 1.872340425531915, |
|
"eval_loss": 4.103922367095947, |
|
"eval_runtime": 93.6123, |
|
"eval_samples_per_second": 3.205, |
|
"eval_steps_per_second": 0.053, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 1.8936170212765957, |
|
"grad_norm": 0.0005839240038767457, |
|
"learning_rate": 0.00018421378499199666, |
|
"loss": 3.4657, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 1.8936170212765957, |
|
"eval_loss": 4.103921890258789, |
|
"eval_runtime": 93.4167, |
|
"eval_samples_per_second": 3.211, |
|
"eval_steps_per_second": 0.054, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 1.9148936170212765, |
|
"grad_norm": 0.0006142465863376856, |
|
"learning_rate": 0.00018067121220368906, |
|
"loss": 3.4657, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.9148936170212765, |
|
"eval_loss": 4.103921890258789, |
|
"eval_runtime": 93.1077, |
|
"eval_samples_per_second": 3.222, |
|
"eval_steps_per_second": 0.054, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.9361702127659575, |
|
"grad_norm": 0.0005157635896466672, |
|
"learning_rate": 0.00017712863941538143, |
|
"loss": 3.4657, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 1.9361702127659575, |
|
"eval_loss": 4.103921890258789, |
|
"eval_runtime": 104.9418, |
|
"eval_samples_per_second": 2.859, |
|
"eval_steps_per_second": 0.048, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 1.9574468085106385, |
|
"grad_norm": 0.00047715185792185366, |
|
"learning_rate": 0.0001735860666270738, |
|
"loss": 3.4657, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 1.9574468085106385, |
|
"eval_loss": 4.103921890258789, |
|
"eval_runtime": 143.8492, |
|
"eval_samples_per_second": 2.086, |
|
"eval_steps_per_second": 0.035, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 1.978723404255319, |
|
"grad_norm": 0.0006548584206029773, |
|
"learning_rate": 0.00017004349383876617, |
|
"loss": 3.4657, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 1.978723404255319, |
|
"eval_loss": 4.103920936584473, |
|
"eval_runtime": 93.0081, |
|
"eval_samples_per_second": 3.226, |
|
"eval_steps_per_second": 0.054, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.0007490716525353491, |
|
"learning_rate": 0.00016650092105045854, |
|
"loss": 3.3006, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 4.103919982910156, |
|
"eval_runtime": 93.8488, |
|
"eval_samples_per_second": 3.197, |
|
"eval_steps_per_second": 0.053, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 2.021276595744681, |
|
"grad_norm": 0.001015963382087648, |
|
"learning_rate": 0.00016295834826215094, |
|
"loss": 3.4657, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 2.021276595744681, |
|
"eval_loss": 4.103919506072998, |
|
"eval_runtime": 93.9004, |
|
"eval_samples_per_second": 3.195, |
|
"eval_steps_per_second": 0.053, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 2.0425531914893615, |
|
"grad_norm": 0.004860539920628071, |
|
"learning_rate": 0.00015941577547384328, |
|
"loss": 3.4657, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 2.0425531914893615, |
|
"eval_loss": 4.10391902923584, |
|
"eval_runtime": 105.0138, |
|
"eval_samples_per_second": 2.857, |
|
"eval_steps_per_second": 0.048, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 2.0638297872340425, |
|
"grad_norm": 0.0015178662724792957, |
|
"learning_rate": 0.00015587320268553565, |
|
"loss": 3.4657, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 2.0638297872340425, |
|
"eval_loss": 4.103918552398682, |
|
"eval_runtime": 94.2786, |
|
"eval_samples_per_second": 3.182, |
|
"eval_steps_per_second": 0.053, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 2.0851063829787235, |
|
"grad_norm": 0.0009718618239276111, |
|
"learning_rate": 0.00015233062989722802, |
|
"loss": 3.4657, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 2.0851063829787235, |
|
"eval_loss": 4.103917598724365, |
|
"eval_runtime": 93.9852, |
|
"eval_samples_per_second": 3.192, |
|
"eval_steps_per_second": 0.053, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 2.106382978723404, |
|
"grad_norm": 0.0011399408103898168, |
|
"learning_rate": 0.0001487880571089204, |
|
"loss": 3.4657, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 2.106382978723404, |
|
"eval_loss": 4.103916645050049, |
|
"eval_runtime": 151.4173, |
|
"eval_samples_per_second": 1.981, |
|
"eval_steps_per_second": 0.033, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 2.127659574468085, |
|
"grad_norm": 0.0005256883450783789, |
|
"learning_rate": 0.00014524548432061276, |
|
"loss": 3.4657, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 2.127659574468085, |
|
"eval_loss": 4.103916168212891, |
|
"eval_runtime": 102.8677, |
|
"eval_samples_per_second": 2.916, |
|
"eval_steps_per_second": 0.049, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 2.148936170212766, |
|
"grad_norm": 0.0009113452979363501, |
|
"learning_rate": 0.00014170291153230516, |
|
"loss": 3.4657, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 2.148936170212766, |
|
"eval_loss": 4.103915214538574, |
|
"eval_runtime": 111.9189, |
|
"eval_samples_per_second": 2.681, |
|
"eval_steps_per_second": 0.045, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 2.1702127659574466, |
|
"grad_norm": 0.0010462055215612054, |
|
"learning_rate": 0.00013816033874399752, |
|
"loss": 3.4657, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 2.1702127659574466, |
|
"eval_loss": 4.103914260864258, |
|
"eval_runtime": 126.7351, |
|
"eval_samples_per_second": 2.367, |
|
"eval_steps_per_second": 0.039, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 2.1914893617021276, |
|
"grad_norm": 0.0008617418352514505, |
|
"learning_rate": 0.0001346177659556899, |
|
"loss": 3.4657, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 2.1914893617021276, |
|
"eval_loss": 4.1039137840271, |
|
"eval_runtime": 124.4636, |
|
"eval_samples_per_second": 2.41, |
|
"eval_steps_per_second": 0.04, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 2.2127659574468086, |
|
"grad_norm": 0.0011684081982821226, |
|
"learning_rate": 0.00013107519316738224, |
|
"loss": 3.4657, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 2.2127659574468086, |
|
"eval_loss": 4.103911876678467, |
|
"eval_runtime": 94.3018, |
|
"eval_samples_per_second": 3.181, |
|
"eval_steps_per_second": 0.053, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 2.2340425531914896, |
|
"grad_norm": 0.0008167130872607231, |
|
"learning_rate": 0.0001275326203790746, |
|
"loss": 3.4657, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 2.2340425531914896, |
|
"eval_loss": 4.10391092300415, |
|
"eval_runtime": 94.2002, |
|
"eval_samples_per_second": 3.185, |
|
"eval_steps_per_second": 0.053, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 2.25531914893617, |
|
"grad_norm": 0.0542316697537899, |
|
"learning_rate": 0.000123990047590767, |
|
"loss": 3.4657, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 2.25531914893617, |
|
"eval_loss": 4.10391092300415, |
|
"eval_runtime": 93.5571, |
|
"eval_samples_per_second": 3.207, |
|
"eval_steps_per_second": 0.053, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 2.276595744680851, |
|
"grad_norm": 0.002146972343325615, |
|
"learning_rate": 0.00012044747480245936, |
|
"loss": 3.4656, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 2.276595744680851, |
|
"eval_loss": 4.10391092300415, |
|
"eval_runtime": 93.6853, |
|
"eval_samples_per_second": 3.202, |
|
"eval_steps_per_second": 0.053, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 2.297872340425532, |
|
"grad_norm": 0.0018499704310670495, |
|
"learning_rate": 0.00011690490201415174, |
|
"loss": 3.4656, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 2.297872340425532, |
|
"eval_loss": 4.103910446166992, |
|
"eval_runtime": 93.9528, |
|
"eval_samples_per_second": 3.193, |
|
"eval_steps_per_second": 0.053, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 2.3191489361702127, |
|
"grad_norm": 0.0013631999026983976, |
|
"learning_rate": 0.00011336232922584411, |
|
"loss": 3.4657, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 2.3191489361702127, |
|
"eval_loss": 4.103910446166992, |
|
"eval_runtime": 93.3718, |
|
"eval_samples_per_second": 3.213, |
|
"eval_steps_per_second": 0.054, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 2.3404255319148937, |
|
"grad_norm": 0.0021245332900434732, |
|
"learning_rate": 0.00010981975643753648, |
|
"loss": 3.4656, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 2.3404255319148937, |
|
"eval_loss": 4.103908538818359, |
|
"eval_runtime": 116.0788, |
|
"eval_samples_per_second": 2.584, |
|
"eval_steps_per_second": 0.043, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 2.3617021276595747, |
|
"grad_norm": 0.0015864246524870396, |
|
"learning_rate": 0.00010627718364922885, |
|
"loss": 3.4657, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 2.3617021276595747, |
|
"eval_loss": 4.103908061981201, |
|
"eval_runtime": 93.8014, |
|
"eval_samples_per_second": 3.198, |
|
"eval_steps_per_second": 0.053, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 2.382978723404255, |
|
"grad_norm": 0.0017925110878422856, |
|
"learning_rate": 0.00010273461086092122, |
|
"loss": 3.4657, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 2.382978723404255, |
|
"eval_loss": 4.103907108306885, |
|
"eval_runtime": 94.17, |
|
"eval_samples_per_second": 3.186, |
|
"eval_steps_per_second": 0.053, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 2.404255319148936, |
|
"grad_norm": 0.0022581228986382484, |
|
"learning_rate": 9.91920380726136e-05, |
|
"loss": 3.4656, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 2.404255319148936, |
|
"eval_loss": 4.10390567779541, |
|
"eval_runtime": 117.7055, |
|
"eval_samples_per_second": 2.549, |
|
"eval_steps_per_second": 0.042, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 2.425531914893617, |
|
"grad_norm": 0.0026256099808961153, |
|
"learning_rate": 9.564946528430598e-05, |
|
"loss": 3.4656, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 2.425531914893617, |
|
"eval_loss": 4.1039042472839355, |
|
"eval_runtime": 128.097, |
|
"eval_samples_per_second": 2.342, |
|
"eval_steps_per_second": 0.039, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 2.4468085106382977, |
|
"grad_norm": 0.0022717451211065054, |
|
"learning_rate": 9.210689249599833e-05, |
|
"loss": 3.4656, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 2.4468085106382977, |
|
"eval_loss": 4.103902339935303, |
|
"eval_runtime": 113.9038, |
|
"eval_samples_per_second": 2.634, |
|
"eval_steps_per_second": 0.044, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 2.4680851063829787, |
|
"grad_norm": 0.003249780274927616, |
|
"learning_rate": 8.856431970769072e-05, |
|
"loss": 3.4656, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 2.4680851063829787, |
|
"eval_loss": 4.103899955749512, |
|
"eval_runtime": 92.3315, |
|
"eval_samples_per_second": 3.249, |
|
"eval_steps_per_second": 0.054, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 2.4893617021276597, |
|
"grad_norm": 0.0025626318529248238, |
|
"learning_rate": 8.502174691938309e-05, |
|
"loss": 3.4656, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 2.4893617021276597, |
|
"eval_loss": 4.103896617889404, |
|
"eval_runtime": 93.3268, |
|
"eval_samples_per_second": 3.215, |
|
"eval_steps_per_second": 0.054, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 2.5106382978723403, |
|
"grad_norm": 0.004104019142687321, |
|
"learning_rate": 8.147917413107547e-05, |
|
"loss": 3.4655, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 2.5106382978723403, |
|
"eval_loss": 4.103891849517822, |
|
"eval_runtime": 93.0172, |
|
"eval_samples_per_second": 3.225, |
|
"eval_steps_per_second": 0.054, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 2.5319148936170213, |
|
"grad_norm": 0.005048415157943964, |
|
"learning_rate": 7.793660134276782e-05, |
|
"loss": 3.4656, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 2.5319148936170213, |
|
"eval_loss": 4.103887557983398, |
|
"eval_runtime": 103.5236, |
|
"eval_samples_per_second": 2.898, |
|
"eval_steps_per_second": 0.048, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 2.5531914893617023, |
|
"grad_norm": 0.004755980335175991, |
|
"learning_rate": 7.43940285544602e-05, |
|
"loss": 3.4656, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 2.5531914893617023, |
|
"eval_loss": 4.103884220123291, |
|
"eval_runtime": 92.2557, |
|
"eval_samples_per_second": 3.252, |
|
"eval_steps_per_second": 0.054, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 2.574468085106383, |
|
"grad_norm": 0.00427659647539258, |
|
"learning_rate": 7.085145576615258e-05, |
|
"loss": 3.4654, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 2.574468085106383, |
|
"eval_loss": 4.103878498077393, |
|
"eval_runtime": 94.0279, |
|
"eval_samples_per_second": 3.191, |
|
"eval_steps_per_second": 0.053, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 2.595744680851064, |
|
"grad_norm": 0.0044647688046097755, |
|
"learning_rate": 6.730888297784495e-05, |
|
"loss": 3.4655, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 2.595744680851064, |
|
"eval_loss": 4.103872776031494, |
|
"eval_runtime": 98.9692, |
|
"eval_samples_per_second": 3.031, |
|
"eval_steps_per_second": 0.051, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 2.617021276595745, |
|
"grad_norm": 0.00606452114880085, |
|
"learning_rate": 6.37663101895373e-05, |
|
"loss": 3.4654, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 2.617021276595745, |
|
"eval_loss": 4.1038665771484375, |
|
"eval_runtime": 130.5821, |
|
"eval_samples_per_second": 2.297, |
|
"eval_steps_per_second": 0.038, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 2.6382978723404253, |
|
"grad_norm": 0.0025343315210193396, |
|
"learning_rate": 6.022373740122968e-05, |
|
"loss": 3.4655, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 2.6382978723404253, |
|
"eval_loss": 4.103858947753906, |
|
"eval_runtime": 110.6408, |
|
"eval_samples_per_second": 2.711, |
|
"eval_steps_per_second": 0.045, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 2.6595744680851063, |
|
"grad_norm": 0.004990957211703062, |
|
"learning_rate": 5.668116461292206e-05, |
|
"loss": 3.4654, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 2.6595744680851063, |
|
"eval_loss": 4.103850841522217, |
|
"eval_runtime": 110.9115, |
|
"eval_samples_per_second": 2.705, |
|
"eval_steps_per_second": 0.045, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 2.6808510638297873, |
|
"grad_norm": 0.003452206961810589, |
|
"learning_rate": 5.3138591824614426e-05, |
|
"loss": 3.4655, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 2.6808510638297873, |
|
"eval_loss": 4.103842735290527, |
|
"eval_runtime": 116.4041, |
|
"eval_samples_per_second": 2.577, |
|
"eval_steps_per_second": 0.043, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 2.702127659574468, |
|
"grad_norm": 0.01098131388425827, |
|
"learning_rate": 4.95960190363068e-05, |
|
"loss": 3.4655, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 2.702127659574468, |
|
"eval_loss": 4.1038360595703125, |
|
"eval_runtime": 120.3016, |
|
"eval_samples_per_second": 2.494, |
|
"eval_steps_per_second": 0.042, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 2.723404255319149, |
|
"grad_norm": 0.009479314088821411, |
|
"learning_rate": 4.6053446247999166e-05, |
|
"loss": 3.4652, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 2.723404255319149, |
|
"eval_loss": 4.1038289070129395, |
|
"eval_runtime": 102.7773, |
|
"eval_samples_per_second": 2.919, |
|
"eval_steps_per_second": 0.049, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 2.74468085106383, |
|
"grad_norm": 0.018755914643406868, |
|
"learning_rate": 4.251087345969154e-05, |
|
"loss": 3.4646, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 2.74468085106383, |
|
"eval_loss": 4.103816032409668, |
|
"eval_runtime": 155.1062, |
|
"eval_samples_per_second": 1.934, |
|
"eval_steps_per_second": 0.032, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 2.7659574468085104, |
|
"grad_norm": 0.00831508357077837, |
|
"learning_rate": 3.896830067138391e-05, |
|
"loss": 3.4654, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 2.7659574468085104, |
|
"eval_loss": 4.10380220413208, |
|
"eval_runtime": 104.9599, |
|
"eval_samples_per_second": 2.858, |
|
"eval_steps_per_second": 0.048, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 2.7872340425531914, |
|
"grad_norm": 0.00918419286608696, |
|
"learning_rate": 3.542572788307629e-05, |
|
"loss": 3.4654, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 2.7872340425531914, |
|
"eval_loss": 4.103787422180176, |
|
"eval_runtime": 121.0806, |
|
"eval_samples_per_second": 2.478, |
|
"eval_steps_per_second": 0.041, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 2.8085106382978724, |
|
"grad_norm": 0.010791248641908169, |
|
"learning_rate": 3.188315509476865e-05, |
|
"loss": 3.4653, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 2.8085106382978724, |
|
"eval_loss": 4.103773593902588, |
|
"eval_runtime": 117.3875, |
|
"eval_samples_per_second": 2.556, |
|
"eval_steps_per_second": 0.043, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 2.829787234042553, |
|
"grad_norm": 0.018539341166615486, |
|
"learning_rate": 2.834058230646103e-05, |
|
"loss": 3.4651, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 2.829787234042553, |
|
"eval_loss": 4.103758811950684, |
|
"eval_runtime": 134.9208, |
|
"eval_samples_per_second": 2.224, |
|
"eval_steps_per_second": 0.037, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 2.851063829787234, |
|
"grad_norm": 0.013375967741012573, |
|
"learning_rate": 2.47980095181534e-05, |
|
"loss": 3.4649, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 2.851063829787234, |
|
"eval_loss": 4.103740692138672, |
|
"eval_runtime": 112.6527, |
|
"eval_samples_per_second": 2.663, |
|
"eval_steps_per_second": 0.044, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 2.872340425531915, |
|
"grad_norm": 0.01334660779684782, |
|
"learning_rate": 2.125543672984577e-05, |
|
"loss": 3.465, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 2.872340425531915, |
|
"eval_loss": 4.103722095489502, |
|
"eval_runtime": 94.5738, |
|
"eval_samples_per_second": 3.172, |
|
"eval_steps_per_second": 0.053, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 2.8936170212765955, |
|
"grad_norm": 0.017678698524832726, |
|
"learning_rate": 1.7712863941538144e-05, |
|
"loss": 3.4647, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 2.8936170212765955, |
|
"eval_loss": 4.103702068328857, |
|
"eval_runtime": 108.8046, |
|
"eval_samples_per_second": 2.757, |
|
"eval_steps_per_second": 0.046, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 2.9148936170212765, |
|
"grad_norm": 0.023421209305524826, |
|
"learning_rate": 1.4170291153230514e-05, |
|
"loss": 3.4642, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 2.9148936170212765, |
|
"eval_loss": 4.103678226470947, |
|
"eval_runtime": 95.2978, |
|
"eval_samples_per_second": 3.148, |
|
"eval_steps_per_second": 0.052, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 2.9361702127659575, |
|
"grad_norm": 0.02967756614089012, |
|
"learning_rate": 1.0627718364922886e-05, |
|
"loss": 3.4649, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 2.9361702127659575, |
|
"eval_loss": 4.103660583496094, |
|
"eval_runtime": 164.156, |
|
"eval_samples_per_second": 1.828, |
|
"eval_steps_per_second": 0.03, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 2.9574468085106385, |
|
"grad_norm": 0.04368141293525696, |
|
"learning_rate": 7.085145576615257e-06, |
|
"loss": 3.4632, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 2.9574468085106385, |
|
"eval_loss": 4.103642463684082, |
|
"eval_runtime": 135.1093, |
|
"eval_samples_per_second": 2.22, |
|
"eval_steps_per_second": 0.037, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 2.978723404255319, |
|
"grad_norm": 0.024568898603320122, |
|
"learning_rate": 3.5425727883076285e-06, |
|
"loss": 3.4644, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 2.978723404255319, |
|
"eval_loss": 4.103628158569336, |
|
"eval_runtime": 111.5348, |
|
"eval_samples_per_second": 2.69, |
|
"eval_steps_per_second": 0.045, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 0.04271033778786659, |
|
"learning_rate": 0.0, |
|
"loss": 3.2985, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_loss": 4.103620529174805, |
|
"eval_runtime": 122.9717, |
|
"eval_samples_per_second": 2.44, |
|
"eval_steps_per_second": 0.041, |
|
"step": 282 |
|
} |
|
], |
|
"logging_steps": 2, |
|
"max_steps": 282, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 523328480700102.0, |
|
"train_batch_size": 32, |
|
"trial_name": null, |
|
"trial_params": { |
|
"_wandb": {}, |
|
"assignments": {}, |
|
"decay": 0.1, |
|
"learning_rate": 0.0004995027631513756, |
|
"metric": "eval/loss", |
|
"per_device_train_batch_size": 32 |
|
} |
|
} |
|
|