smolm-autoreg-bpe-counterfactual_babylm_aann_low_variability_numeral-seed_1024-1e-3 / trainer_state.json
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 20.0,
"eval_steps": 500,
"global_step": 371860,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{ | |
"epoch": 0.05, | |
"grad_norm": 0.8706823587417603, | |
"learning_rate": 3.125e-05, | |
"loss": 6.2283, | |
"step": 1000 | |
}, | |
{ | |
"epoch": 0.11, | |
"grad_norm": 0.9266456365585327, | |
"learning_rate": 6.25e-05, | |
"loss": 5.0066, | |
"step": 2000 | |
}, | |
{ | |
"epoch": 0.16, | |
"grad_norm": 0.9616362452507019, | |
"learning_rate": 9.375e-05, | |
"loss": 4.6763, | |
"step": 3000 | |
}, | |
{ | |
"epoch": 0.22, | |
"grad_norm": 0.7885107398033142, | |
"learning_rate": 0.000125, | |
"loss": 4.4613, | |
"step": 4000 | |
}, | |
{ | |
"epoch": 0.27, | |
"grad_norm": 0.7654786705970764, | |
"learning_rate": 0.00015625, | |
"loss": 4.3065, | |
"step": 5000 | |
}, | |
{ | |
"epoch": 0.32, | |
"grad_norm": 0.7073574662208557, | |
"learning_rate": 0.0001875, | |
"loss": 4.1746, | |
"step": 6000 | |
}, | |
{ | |
"epoch": 0.38, | |
"grad_norm": 0.6433243751525879, | |
"learning_rate": 0.00021875, | |
"loss": 4.075, | |
"step": 7000 | |
}, | |
{ | |
"epoch": 0.43, | |
"grad_norm": 0.6497963070869446, | |
"learning_rate": 0.00025, | |
"loss": 3.9794, | |
"step": 8000 | |
}, | |
{ | |
"epoch": 0.48, | |
"grad_norm": 0.6403814554214478, | |
"learning_rate": 0.00028121875, | |
"loss": 3.9048, | |
"step": 9000 | |
}, | |
{ | |
"epoch": 0.54, | |
"grad_norm": 0.6297944188117981, | |
"learning_rate": 0.00031246875000000003, | |
"loss": 3.8448, | |
"step": 10000 | |
}, | |
{ | |
"epoch": 0.59, | |
"grad_norm": 0.5447595715522766, | |
"learning_rate": 0.00034371875, | |
"loss": 3.7899, | |
"step": 11000 | |
}, | |
{ | |
"epoch": 0.65, | |
"grad_norm": 0.5269156098365784, | |
"learning_rate": 0.0003749375, | |
"loss": 3.7537, | |
"step": 12000 | |
}, | |
{ | |
"epoch": 0.7, | |
"grad_norm": 0.5088472366333008, | |
"learning_rate": 0.0004061875, | |
"loss": 3.7246, | |
"step": 13000 | |
}, | |
{ | |
"epoch": 0.75, | |
"grad_norm": 0.4877682030200958, | |
"learning_rate": 0.00043737500000000005, | |
"loss": 3.6885, | |
"step": 14000 | |
}, | |
{ | |
"epoch": 0.81, | |
"grad_norm": 0.45420414209365845, | |
"learning_rate": 0.000468625, | |
"loss": 3.6676, | |
"step": 15000 | |
}, | |
{ | |
"epoch": 0.86, | |
"grad_norm": 0.44210752844810486, | |
"learning_rate": 0.000499875, | |
"loss": 3.6451, | |
"step": 16000 | |
}, | |
{ | |
"epoch": 0.91, | |
"grad_norm": 0.41903868317604065, | |
"learning_rate": 0.000531125, | |
"loss": 3.6174, | |
"step": 17000 | |
}, | |
{ | |
"epoch": 0.97, | |
"grad_norm": 0.4097944498062134, | |
"learning_rate": 0.0005623749999999999, | |
"loss": 3.598, | |
"step": 18000 | |
}, | |
{ | |
"epoch": 1.0, | |
"eval_accuracy": 0.35856486935076, | |
"eval_loss": 3.8055434226989746, | |
"eval_runtime": 153.4601, | |
"eval_samples_per_second": 377.434, | |
"eval_steps_per_second": 5.904, | |
"step": 18593 | |
}, | |
{ | |
"epoch": 1.02, | |
"grad_norm": 0.36287716031074524, | |
"learning_rate": 0.0005935625, | |
"loss": 3.5686, | |
"step": 19000 | |
}, | |
{ | |
"epoch": 1.08, | |
"grad_norm": 0.34093010425567627, | |
"learning_rate": 0.0006248125, | |
"loss": 3.5426, | |
"step": 20000 | |
}, | |
{ | |
"epoch": 1.13, | |
"grad_norm": 0.3330960273742676, | |
"learning_rate": 0.0006560625, | |
"loss": 3.5376, | |
"step": 21000 | |
}, | |
{ | |
"epoch": 1.18, | |
"grad_norm": 0.30257412791252136, | |
"learning_rate": 0.00068728125, | |
"loss": 3.5228, | |
"step": 22000 | |
}, | |
{ | |
"epoch": 1.24, | |
"grad_norm": 0.2785021960735321, | |
"learning_rate": 0.00071853125, | |
"loss": 3.5095, | |
"step": 23000 | |
}, | |
{ | |
"epoch": 1.29, | |
"grad_norm": 0.2798495292663574, | |
"learning_rate": 0.00074978125, | |
"loss": 3.502, | |
"step": 24000 | |
}, | |
{ | |
"epoch": 1.34, | |
"grad_norm": 0.27469247579574585, | |
"learning_rate": 0.0007810312499999999, | |
"loss": 3.4931, | |
"step": 25000 | |
}, | |
{ | |
"epoch": 1.4, | |
"grad_norm": 0.28253281116485596, | |
"learning_rate": 0.00081225, | |
"loss": 3.4818, | |
"step": 26000 | |
}, | |
{ | |
"epoch": 1.45, | |
"grad_norm": 0.2425285279750824, | |
"learning_rate": 0.0008435000000000001, | |
"loss": 3.4757, | |
"step": 27000 | |
}, | |
{ | |
"epoch": 1.51, | |
"grad_norm": 0.26154839992523193, | |
"learning_rate": 0.00087471875, | |
"loss": 3.465, | |
"step": 28000 | |
}, | |
{ | |
"epoch": 1.56, | |
"grad_norm": 0.24984851479530334, | |
"learning_rate": 0.00090596875, | |
"loss": 3.4591, | |
"step": 29000 | |
}, | |
{ | |
"epoch": 1.61, | |
"grad_norm": 0.24280162155628204, | |
"learning_rate": 0.00093721875, | |
"loss": 3.4481, | |
"step": 30000 | |
}, | |
{ | |
"epoch": 1.67, | |
"grad_norm": 0.2471531629562378, | |
"learning_rate": 0.00096846875, | |
"loss": 3.4382, | |
"step": 31000 | |
}, | |
{ | |
"epoch": 1.72, | |
"grad_norm": 0.21386803686618805, | |
"learning_rate": 0.0009996875, | |
"loss": 3.4374, | |
"step": 32000 | |
}, | |
{ | |
"epoch": 1.77, | |
"grad_norm": 0.23056912422180176, | |
"learning_rate": 0.0009970870358382863, | |
"loss": 3.4262, | |
"step": 33000 | |
}, | |
{ | |
"epoch": 1.83, | |
"grad_norm": 0.22734513878822327, | |
"learning_rate": 0.0009941446477961513, | |
"loss": 3.4206, | |
"step": 34000 | |
}, | |
{ | |
"epoch": 1.88, | |
"grad_norm": 0.27341413497924805, | |
"learning_rate": 0.0009912052021420585, | |
"loss": 3.4028, | |
"step": 35000 | |
}, | |
{ | |
"epoch": 1.94, | |
"grad_norm": 0.2197876125574112, | |
"learning_rate": 0.0009882628140999235, | |
"loss": 3.3941, | |
"step": 36000 | |
}, | |
{ | |
"epoch": 1.99, | |
"grad_norm": 0.2100793868303299, | |
"learning_rate": 0.0009853204260577885, | |
"loss": 3.3819, | |
"step": 37000 | |
}, | |
{ | |
"epoch": 2.0, | |
"eval_accuracy": 0.3806616512481747, | |
"eval_loss": 3.5619659423828125, | |
"eval_runtime": 155.2921, | |
"eval_samples_per_second": 372.981, | |
"eval_steps_per_second": 5.834, | |
"step": 37186 | |
}, | |
{ | |
"epoch": 2.04, | |
"grad_norm": 0.2338385134935379, | |
"learning_rate": 0.0009823839227917379, | |
"loss": 3.3373, | |
"step": 38000 | |
}, | |
{ | |
"epoch": 2.1, | |
"grad_norm": 0.20092932879924774, | |
"learning_rate": 0.0009794415347496028, | |
"loss": 3.3246, | |
"step": 39000 | |
}, | |
{ | |
"epoch": 2.15, | |
"grad_norm": 0.2200673222541809, | |
"learning_rate": 0.0009764991467074677, | |
"loss": 3.3266, | |
"step": 40000 | |
}, | |
{ | |
"epoch": 2.21, | |
"grad_norm": 0.2211606204509735, | |
"learning_rate": 0.0009735567586653328, | |
"loss": 3.3149, | |
"step": 41000 | |
}, | |
{ | |
"epoch": 2.26, | |
"grad_norm": 0.19502712786197662, | |
"learning_rate": 0.0009706143706231978, | |
"loss": 3.3163, | |
"step": 42000 | |
}, | |
{ | |
"epoch": 2.31, | |
"grad_norm": 0.2075120508670807, | |
"learning_rate": 0.000967674924969105, | |
"loss": 3.3074, | |
"step": 43000 | |
}, | |
{ | |
"epoch": 2.37, | |
"grad_norm": 0.22068411111831665, | |
"learning_rate": 0.0009647325369269699, | |
"loss": 3.3013, | |
"step": 44000 | |
}, | |
{ | |
"epoch": 2.42, | |
"grad_norm": 0.19794422388076782, | |
"learning_rate": 0.000961793091272877, | |
"loss": 3.3064, | |
"step": 45000 | |
}, | |
{ | |
"epoch": 2.47, | |
"grad_norm": 0.2203867882490158, | |
"learning_rate": 0.0009588507032307421, | |
"loss": 3.2953, | |
"step": 46000 | |
}, | |
{ | |
"epoch": 2.53, | |
"grad_norm": 0.20799222588539124, | |
"learning_rate": 0.0009559112575766492, | |
"loss": 3.2912, | |
"step": 47000 | |
}, | |
{ | |
"epoch": 2.58, | |
"grad_norm": 0.22151567041873932, | |
"learning_rate": 0.0009529688695345143, | |
"loss": 3.2828, | |
"step": 48000 | |
}, | |
{ | |
"epoch": 2.64, | |
"grad_norm": 0.21492493152618408, | |
"learning_rate": 0.0009500264814923793, | |
"loss": 3.2807, | |
"step": 49000 | |
}, | |
{ | |
"epoch": 2.69, | |
"grad_norm": 0.22774606943130493, | |
"learning_rate": 0.0009470840934502442, | |
"loss": 3.277, | |
"step": 50000 | |
}, | |
{ | |
"epoch": 2.74, | |
"grad_norm": 0.20704419910907745, | |
"learning_rate": 0.0009441417054081093, | |
"loss": 3.2756, | |
"step": 51000 | |
}, | |
{ | |
"epoch": 2.8, | |
"grad_norm": 0.18878625333309174, | |
"learning_rate": 0.0009412052021420585, | |
"loss": 3.2698, | |
"step": 52000 | |
}, | |
{ | |
"epoch": 2.85, | |
"grad_norm": 0.21784694492816925, | |
"learning_rate": 0.0009382628140999236, | |
"loss": 3.2633, | |
"step": 53000 | |
}, | |
{ | |
"epoch": 2.9, | |
"grad_norm": 0.19541580975055695, | |
"learning_rate": 0.0009353204260577886, | |
"loss": 3.2603, | |
"step": 54000 | |
}, | |
{ | |
"epoch": 2.96, | |
"grad_norm": 0.1930837631225586, | |
"learning_rate": 0.0009323780380156535, | |
"loss": 3.2622, | |
"step": 55000 | |
}, | |
{ | |
"epoch": 3.0, | |
"eval_accuracy": 0.3929298036900791, | |
"eval_loss": 3.4625182151794434, | |
"eval_runtime": 154.3214, | |
"eval_samples_per_second": 375.327, | |
"eval_steps_per_second": 5.871, | |
"step": 55779 | |
}, | |
{ | |
"epoch": 3.01, | |
"grad_norm": 0.20553229749202728, | |
"learning_rate": 0.0009294356499735185, | |
"loss": 3.2366, | |
"step": 56000 | |
}, | |
{ | |
"epoch": 3.07, | |
"grad_norm": 0.20729704201221466, | |
"learning_rate": 0.0009264962043194257, | |
"loss": 3.1896, | |
"step": 57000 | |
}, | |
{ | |
"epoch": 3.12, | |
"grad_norm": 0.20280729234218597, | |
"learning_rate": 0.0009235538162772906, | |
"loss": 3.1933, | |
"step": 58000 | |
}, | |
{ | |
"epoch": 3.17, | |
"grad_norm": 0.1880486160516739, | |
"learning_rate": 0.0009206114282351557, | |
"loss": 3.1987, | |
"step": 59000 | |
}, | |
{ | |
"epoch": 3.23, | |
"grad_norm": 0.2021629512310028, | |
"learning_rate": 0.0009176719825810627, | |
"loss": 3.2011, | |
"step": 60000 | |
}, | |
{ | |
"epoch": 3.28, | |
"grad_norm": 0.24562139809131622, | |
"learning_rate": 0.0009147295945389278, | |
"loss": 3.1928, | |
"step": 61000 | |
}, | |
{ | |
"epoch": 3.33, | |
"grad_norm": 0.18961690366268158, | |
"learning_rate": 0.0009117901488848349, | |
"loss": 3.1966, | |
"step": 62000 | |
}, | |
{ | |
"epoch": 3.39, | |
"grad_norm": 0.2056853473186493, | |
"learning_rate": 0.0009088477608426999, | |
"loss": 3.1888, | |
"step": 63000 | |
}, | |
{ | |
"epoch": 3.44, | |
"grad_norm": 0.18212585151195526, | |
"learning_rate": 0.000905905372800565, | |
"loss": 3.1906, | |
"step": 64000 | |
}, | |
{ | |
"epoch": 3.5, | |
"grad_norm": 0.21010248363018036, | |
"learning_rate": 0.000902965927146472, | |
"loss": 3.1925, | |
"step": 65000 | |
}, | |
{ | |
"epoch": 3.55, | |
"grad_norm": 0.21237117052078247, | |
"learning_rate": 0.0009000235391043371, | |
"loss": 3.1918, | |
"step": 66000 | |
}, | |
{ | |
"epoch": 3.6, | |
"grad_norm": 0.24303843080997467, | |
"learning_rate": 0.0008970840934502442, | |
"loss": 3.1918, | |
"step": 67000 | |
}, | |
{ | |
"epoch": 3.66, | |
"grad_norm": 0.19125622510910034, | |
"learning_rate": 0.0008941417054081092, | |
"loss": 3.1866, | |
"step": 68000 | |
}, | |
{ | |
"epoch": 3.71, | |
"grad_norm": 0.23382623493671417, | |
"learning_rate": 0.0008912022597540164, | |
"loss": 3.1844, | |
"step": 69000 | |
}, | |
{ | |
"epoch": 3.76, | |
"grad_norm": 0.21234822273254395, | |
"learning_rate": 0.0008882598717118814, | |
"loss": 3.1852, | |
"step": 70000 | |
}, | |
{ | |
"epoch": 3.82, | |
"grad_norm": 0.2681824564933777, | |
"learning_rate": 0.0008853204260577885, | |
"loss": 3.1855, | |
"step": 71000 | |
}, | |
{ | |
"epoch": 3.87, | |
"grad_norm": 0.19967304170131683, | |
"learning_rate": 0.0008823780380156535, | |
"loss": 3.1781, | |
"step": 72000 | |
}, | |
{ | |
"epoch": 3.93, | |
"grad_norm": 0.19710466265678406, | |
"learning_rate": 0.0008794385923615607, | |
"loss": 3.1804, | |
"step": 73000 | |
}, | |
{ | |
"epoch": 3.98, | |
"grad_norm": 0.2174214869737625, | |
"learning_rate": 0.0008764962043194257, | |
"loss": 3.1788, | |
"step": 74000 | |
}, | |
{ | |
"epoch": 4.0, | |
"eval_accuracy": 0.39836750674153854, | |
"eval_loss": 3.4131386280059814, | |
"eval_runtime": 154.2916, | |
"eval_samples_per_second": 375.4, | |
"eval_steps_per_second": 5.872, | |
"step": 74372 | |
}, | |
{ | |
"epoch": 4.03, | |
"grad_norm": 0.23124143481254578, | |
"learning_rate": 0.0008735567586653328, | |
"loss": 3.1339, | |
"step": 75000 | |
}, | |
{ | |
"epoch": 4.09, | |
"grad_norm": 0.2329677790403366, | |
"learning_rate": 0.0008706143706231978, | |
"loss": 3.1142, | |
"step": 76000 | |
}, | |
{ | |
"epoch": 4.14, | |
"grad_norm": 0.21693040430545807, | |
"learning_rate": 0.0008676749249691049, | |
"loss": 3.1174, | |
"step": 77000 | |
}, | |
{ | |
"epoch": 4.2, | |
"grad_norm": 0.2263847142457962, | |
"learning_rate": 0.00086473253692697, | |
"loss": 3.1182, | |
"step": 78000 | |
}, | |
{ | |
"epoch": 4.25, | |
"grad_norm": 0.23874357342720032, | |
"learning_rate": 0.0008617930912728771, | |
"loss": 3.1247, | |
"step": 79000 | |
}, | |
{ | |
"epoch": 4.3, | |
"grad_norm": 0.21914631128311157, | |
"learning_rate": 0.0008588507032307421, | |
"loss": 3.1239, | |
"step": 80000 | |
}, | |
{ | |
"epoch": 4.36, | |
"grad_norm": 0.24384135007858276, | |
"learning_rate": 0.0008559083151886072, | |
"loss": 3.1273, | |
"step": 81000 | |
}, | |
{ | |
"epoch": 4.41, | |
"grad_norm": 0.20350953936576843, | |
"learning_rate": 0.0008529688695345142, | |
"loss": 3.1251, | |
"step": 82000 | |
}, | |
{ | |
"epoch": 4.46, | |
"grad_norm": 0.19013893604278564, | |
"learning_rate": 0.0008500264814923793, | |
"loss": 3.1249, | |
"step": 83000 | |
}, | |
{ | |
"epoch": 4.52, | |
"grad_norm": 0.20296384394168854, | |
"learning_rate": 0.0008470840934502443, | |
"loss": 3.1283, | |
"step": 84000 | |
}, | |
{ | |
"epoch": 4.57, | |
"grad_norm": 0.19276568293571472, | |
"learning_rate": 0.0008441417054081092, | |
"loss": 3.1257, | |
"step": 85000 | |
}, | |
{ | |
"epoch": 4.63, | |
"grad_norm": 0.196748286485672, | |
"learning_rate": 0.0008412022597540165, | |
"loss": 3.1242, | |
"step": 86000 | |
}, | |
{ | |
"epoch": 4.68, | |
"grad_norm": 0.20028959214687347, | |
"learning_rate": 0.0008382598717118813, | |
"loss": 3.1246, | |
"step": 87000 | |
}, | |
{ | |
"epoch": 4.73, | |
"grad_norm": 0.20650836825370789, | |
"learning_rate": 0.0008353174836697463, | |
"loss": 3.1273, | |
"step": 88000 | |
}, | |
{ | |
"epoch": 4.79, | |
"grad_norm": 0.2691803574562073, | |
"learning_rate": 0.0008323780380156535, | |
"loss": 3.1219, | |
"step": 89000 | |
}, | |
{ | |
"epoch": 4.84, | |
"grad_norm": 0.20094037055969238, | |
"learning_rate": 0.0008294385923615606, | |
"loss": 3.1236, | |
"step": 90000 | |
}, | |
{ | |
"epoch": 4.89, | |
"grad_norm": 0.23795652389526367, | |
"learning_rate": 0.0008264962043194257, | |
"loss": 3.1256, | |
"step": 91000 | |
}, | |
{ | |
"epoch": 4.95, | |
"grad_norm": 0.20850218832492828, | |
"learning_rate": 0.0008235538162772906, | |
"loss": 3.1251, | |
"step": 92000 | |
}, | |
{ | |
"epoch": 5.0, | |
"eval_accuracy": 0.40224626364489346, | |
"eval_loss": 3.385948419570923, | |
"eval_runtime": 154.3892, | |
"eval_samples_per_second": 375.162, | |
"eval_steps_per_second": 5.868, | |
"step": 92965 | |
}, | |
{ | |
"epoch": 5.0, | |
"grad_norm": 0.19185079634189606, | |
"learning_rate": 0.0008206114282351556, | |
"loss": 3.1207, | |
"step": 93000 | |
}, | |
{ | |
"epoch": 5.06, | |
"grad_norm": 0.2044549584388733, | |
"learning_rate": 0.0008176719825810628, | |
"loss": 3.0552, | |
"step": 94000 | |
}, | |
{ | |
"epoch": 5.11, | |
"grad_norm": 0.2207447588443756, | |
"learning_rate": 0.0008147295945389278, | |
"loss": 3.0595, | |
"step": 95000 | |
}, | |
{ | |
"epoch": 5.16, | |
"grad_norm": 0.2116185575723648, | |
"learning_rate": 0.0008117872064967929, | |
"loss": 3.0654, | |
"step": 96000 | |
}, | |
{ | |
"epoch": 5.22, | |
"grad_norm": 0.20827573537826538, | |
"learning_rate": 0.0008088477608427, | |
"loss": 3.0653, | |
"step": 97000 | |
}, | |
{ | |
"epoch": 5.27, | |
"grad_norm": 0.21885184943675995, | |
"learning_rate": 0.0008059053728005649, | |
"loss": 3.0689, | |
"step": 98000 | |
}, | |
{ | |
"epoch": 5.32, | |
"grad_norm": 0.22063520550727844, | |
"learning_rate": 0.0008029659271464721, | |
"loss": 3.0734, | |
"step": 99000 | |
}, | |
{ | |
"epoch": 5.38, | |
"grad_norm": 0.21756470203399658, | |
"learning_rate": 0.0008000235391043371, | |
"loss": 3.0761, | |
"step": 100000 | |
}, | |
{ | |
"epoch": 5.43, | |
"grad_norm": 0.2308843582868576, | |
"learning_rate": 0.0007970811510622022, | |
"loss": 3.0764, | |
"step": 101000 | |
}, | |
{ | |
"epoch": 5.49, | |
"grad_norm": 0.2255343496799469, | |
"learning_rate": 0.0007941387630200672, | |
"loss": 3.0786, | |
"step": 102000 | |
}, | |
{ | |
"epoch": 5.54, | |
"grad_norm": 0.21067161858081818, | |
"learning_rate": 0.0007912022597540164, | |
"loss": 3.0786, | |
"step": 103000 | |
}, | |
{ | |
"epoch": 5.59, | |
"grad_norm": 0.21768967807292938, | |
"learning_rate": 0.0007882598717118814, | |
"loss": 3.0773, | |
"step": 104000 | |
}, | |
{ | |
"epoch": 5.65, | |
"grad_norm": 0.20667089521884918, | |
"learning_rate": 0.0007853174836697464, | |
"loss": 3.0795, | |
"step": 105000 | |
}, | |
{ | |
"epoch": 5.7, | |
"grad_norm": 0.21367168426513672, | |
"learning_rate": 0.0007823750956276114, | |
"loss": 3.082, | |
"step": 106000 | |
}, | |
{ | |
"epoch": 5.75, | |
"grad_norm": 0.2038014829158783, | |
"learning_rate": 0.0007794327075854764, | |
"loss": 3.0785, | |
"step": 107000 | |
}, | |
{ | |
"epoch": 5.81, | |
"grad_norm": 0.21575333178043365, | |
"learning_rate": 0.0007764932619313834, | |
"loss": 3.0808, | |
"step": 108000 | |
}, | |
{ | |
"epoch": 5.86, | |
"grad_norm": 0.20344999432563782, | |
"learning_rate": 0.0007735508738892485, | |
"loss": 3.0829, | |
"step": 109000 | |
}, | |
{ | |
"epoch": 5.92, | |
"grad_norm": 0.2332640290260315, | |
"learning_rate": 0.0007706114282351556, | |
"loss": 3.0769, | |
"step": 110000 | |
}, | |
{ | |
"epoch": 5.97, | |
"grad_norm": 0.23659543693065643, | |
"learning_rate": 0.0007676690401930207, | |
"loss": 3.0765, | |
"step": 111000 | |
}, | |
{ | |
"epoch": 6.0, | |
"eval_accuracy": 0.4050011228899796, | |
"eval_loss": 3.3707284927368164, | |
"eval_runtime": 153.9554, | |
"eval_samples_per_second": 376.219, | |
"eval_steps_per_second": 5.885, | |
"step": 111558 | |
}, | |
{ | |
"epoch": 6.02, | |
"grad_norm": 0.21739870309829712, | |
"learning_rate": 0.0007647266521508857, | |
"loss": 3.0492, | |
"step": 112000 | |
}, | |
{ | |
"epoch": 6.08, | |
"grad_norm": 0.23045282065868378, | |
"learning_rate": 0.0007617872064967927, | |
"loss": 3.0123, | |
"step": 113000 | |
}, | |
{ | |
"epoch": 6.13, | |
"grad_norm": 0.2114897221326828, | |
"learning_rate": 0.0007588448184546578, | |
"loss": 3.0196, | |
"step": 114000 | |
}, | |
{ | |
"epoch": 6.19, | |
"grad_norm": 0.21402007341384888, | |
"learning_rate": 0.0007559053728005649, | |
"loss": 3.0229, | |
"step": 115000 | |
}, | |
{ | |
"epoch": 6.24, | |
"grad_norm": 0.22187209129333496, | |
"learning_rate": 0.00075296298475843, | |
"loss": 3.0269, | |
"step": 116000 | |
}, | |
{ | |
"epoch": 6.29, | |
"grad_norm": 0.2102159708738327, | |
"learning_rate": 0.000750020596716295, | |
"loss": 3.0273, | |
"step": 117000 | |
}, | |
{ | |
"epoch": 6.35, | |
"grad_norm": 0.2769564092159271, | |
"learning_rate": 0.000747081151062202, | |
"loss": 3.0318, | |
"step": 118000 | |
}, | |
{ | |
"epoch": 6.4, | |
"grad_norm": 0.2173321396112442, | |
"learning_rate": 0.0007441387630200671, | |
"loss": 3.0331, | |
"step": 119000 | |
}, | |
{ | |
"epoch": 6.45, | |
"grad_norm": 0.24418728053569794, | |
"learning_rate": 0.0007411993173659742, | |
"loss": 3.0376, | |
"step": 120000 | |
}, | |
{ | |
"epoch": 6.51, | |
"grad_norm": 0.23827865719795227, | |
"learning_rate": 0.0007382569293238393, | |
"loss": 3.0404, | |
"step": 121000 | |
}, | |
{ | |
"epoch": 6.56, | |
"grad_norm": 0.2125864326953888, | |
"learning_rate": 0.0007353174836697464, | |
"loss": 3.0366, | |
"step": 122000 | |
}, | |
{ | |
"epoch": 6.62, | |
"grad_norm": 0.21209298074245453, | |
"learning_rate": 0.0007323750956276114, | |
"loss": 3.0408, | |
"step": 123000 | |
}, | |
{ | |
"epoch": 6.67, | |
"grad_norm": 0.21926939487457275, | |
"learning_rate": 0.0007294327075854764, | |
"loss": 3.0435, | |
"step": 124000 | |
}, | |
{ | |
"epoch": 6.72, | |
"grad_norm": 0.2240251749753952, | |
"learning_rate": 0.0007264932619313835, | |
"loss": 3.0439, | |
"step": 125000 | |
}, | |
{ | |
"epoch": 6.78, | |
"grad_norm": 0.21099534630775452, | |
"learning_rate": 0.0007235508738892486, | |
"loss": 3.0415, | |
"step": 126000 | |
}, | |
{ | |
"epoch": 6.83, | |
"grad_norm": 0.20424066483974457, | |
"learning_rate": 0.0007206084858471136, | |
"loss": 3.0444, | |
"step": 127000 | |
}, | |
{ | |
"epoch": 6.88, | |
"grad_norm": 0.22034712135791779, | |
"learning_rate": 0.0007176660978049785, | |
"loss": 3.0448, | |
"step": 128000 | |
}, | |
{ | |
"epoch": 6.94, | |
"grad_norm": 0.23952622711658478, | |
"learning_rate": 0.0007147237097628436, | |
"loss": 3.0427, | |
"step": 129000 | |
}, | |
{ | |
"epoch": 6.99, | |
"grad_norm": 0.21370893716812134, | |
"learning_rate": 0.0007117842641087506, | |
"loss": 3.0458, | |
"step": 130000 | |
}, | |
{ | |
"epoch": 7.0, | |
"eval_accuracy": 0.4076835636248676, | |
"eval_loss": 3.3522255420684814, | |
"eval_runtime": 154.4925, | |
"eval_samples_per_second": 374.911, | |
"eval_steps_per_second": 5.864, | |
"step": 130151 | |
}, | |
{ | |
"epoch": 7.05, | |
"grad_norm": 0.22439521551132202, | |
"learning_rate": 0.0007088448184546579, | |
"loss": 2.9871, | |
"step": 131000 | |
}, | |
{ | |
"epoch": 7.1, | |
"grad_norm": 0.24310387670993805, | |
"learning_rate": 0.0007059024304125228, | |
"loss": 2.98, | |
"step": 132000 | |
}, | |
{ | |
"epoch": 7.15, | |
"grad_norm": 0.22332601249217987, | |
"learning_rate": 0.0007029600423703878, | |
"loss": 2.9863, | |
"step": 133000 | |
}, | |
{ | |
"epoch": 7.21, | |
"grad_norm": 0.24013490974903107, | |
"learning_rate": 0.0007000176543282529, | |
"loss": 2.9879, | |
"step": 134000 | |
}, | |
{ | |
"epoch": 7.26, | |
"grad_norm": 0.23142094910144806, | |
"learning_rate": 0.0006970782086741599, | |
"loss": 2.9914, | |
"step": 135000 | |
}, | |
{ | |
"epoch": 7.31, | |
"grad_norm": 0.236104354262352, | |
"learning_rate": 0.0006941358206320249, | |
"loss": 2.9964, | |
"step": 136000 | |
}, | |
{ | |
"epoch": 7.37, | |
"grad_norm": 0.33047351241111755, | |
"learning_rate": 0.0006911963749779321, | |
"loss": 2.9986, | |
"step": 137000 | |
}, | |
{ | |
"epoch": 7.42, | |
"grad_norm": 0.26023513078689575, | |
"learning_rate": 0.0006882539869357971, | |
"loss": 2.9995, | |
"step": 138000 | |
}, | |
{ | |
"epoch": 7.48, | |
"grad_norm": 0.26877066493034363, | |
"learning_rate": 0.0006853115988936622, | |
"loss": 3.0005, | |
"step": 139000 | |
}, | |
{ | |
"epoch": 7.53, | |
"grad_norm": 0.2331291288137436, | |
"learning_rate": 0.0006823721532395692, | |
"loss": 3.0057, | |
"step": 140000 | |
}, | |
{ | |
"epoch": 7.58, | |
"grad_norm": 0.21710553765296936, | |
"learning_rate": 0.0006794297651974342, | |
"loss": 3.0107, | |
"step": 141000 | |
}, | |
{ | |
"epoch": 7.64, | |
"grad_norm": 0.24514536559581757, | |
"learning_rate": 0.0006764903195433414, | |
"loss": 3.0055, | |
"step": 142000 | |
}, | |
{ | |
"epoch": 7.69, | |
"grad_norm": 0.21572381258010864, | |
"learning_rate": 0.0006735508738892485, | |
"loss": 3.0102, | |
"step": 143000 | |
}, | |
{ | |
"epoch": 7.74, | |
"grad_norm": 0.23894068598747253, | |
"learning_rate": 0.0006706084858471136, | |
"loss": 3.0101, | |
"step": 144000 | |
}, | |
{ | |
"epoch": 7.8, | |
"grad_norm": 0.20779648423194885, | |
"learning_rate": 0.0006676660978049786, | |
"loss": 3.0136, | |
"step": 145000 | |
}, | |
{ | |
"epoch": 7.85, | |
"grad_norm": 0.21832165122032166, | |
"learning_rate": 0.0006647237097628435, | |
"loss": 3.0144, | |
"step": 146000 | |
}, | |
{ | |
"epoch": 7.91, | |
"grad_norm": 0.2295515537261963, | |
"learning_rate": 0.0006617813217207086, | |
"loss": 3.0162, | |
"step": 147000 | |
}, | |
{ | |
"epoch": 7.96, | |
"grad_norm": 0.21139173209667206, | |
"learning_rate": 0.0006588418760666157, | |
"loss": 3.0116, | |
"step": 148000 | |
}, | |
{ | |
"epoch": 8.0, | |
"eval_accuracy": 0.4079684679864168, | |
"eval_loss": 3.3560173511505127, | |
"eval_runtime": 154.3168, | |
"eval_samples_per_second": 375.338, | |
"eval_steps_per_second": 5.871, | |
"step": 148744 | |
}, | |
{ | |
"epoch": 8.01, | |
"grad_norm": 0.2352878898382187, | |
"learning_rate": 0.0006558994880244807, | |
"loss": 2.9921, | |
"step": 149000 | |
}, | |
{ | |
"epoch": 8.07, | |
"grad_norm": 0.26401156187057495, | |
"learning_rate": 0.0006529600423703879, | |
"loss": 2.9483, | |
"step": 150000 | |
}, | |
{ | |
"epoch": 8.12, | |
"grad_norm": 0.2650126814842224, | |
"learning_rate": 0.0006500176543282527, | |
"loss": 2.9538, | |
"step": 151000 | |
}, | |
{ | |
"epoch": 8.18, | |
"grad_norm": 0.2173539251089096, | |
"learning_rate": 0.00064707820867416, | |
"loss": 2.9592, | |
"step": 152000 | |
}, | |
{ | |
"epoch": 8.23, | |
"grad_norm": 0.23582415282726288, | |
"learning_rate": 0.0006441358206320249, | |
"loss": 2.9674, | |
"step": 153000 | |
}, | |
{ | |
"epoch": 8.28, | |
"grad_norm": 0.24998264014720917, | |
"learning_rate": 0.0006411963749779322, | |
"loss": 2.9634, | |
"step": 154000 | |
}, | |
{ | |
"epoch": 8.34, | |
"grad_norm": 0.2233651876449585, | |
"learning_rate": 0.0006382539869357971, | |
"loss": 2.9725, | |
"step": 155000 | |
}, | |
{ | |
"epoch": 8.39, | |
"grad_norm": 0.2446517050266266, | |
"learning_rate": 0.000635311598893662, | |
"loss": 2.9668, | |
"step": 156000 | |
}, | |
{ | |
"epoch": 8.44, | |
"grad_norm": 0.27410104870796204, | |
"learning_rate": 0.0006323692108515271, | |
"loss": 2.9716, | |
"step": 157000 | |
}, | |
{ | |
"epoch": 8.5, | |
"grad_norm": 0.19913798570632935, | |
"learning_rate": 0.0006294268228093921, | |
"loss": 2.9744, | |
"step": 158000 | |
}, | |
{ | |
"epoch": 8.55, | |
"grad_norm": 0.24324330687522888, | |
"learning_rate": 0.0006264844347672571, | |
"loss": 2.975, | |
"step": 159000 | |
}, | |
{ | |
"epoch": 8.61, | |
"grad_norm": 0.2302367389202118, | |
"learning_rate": 0.0006235479315012064, | |
"loss": 2.9765, | |
"step": 160000 | |
}, | |
{ | |
"epoch": 8.66, | |
"grad_norm": 0.25450703501701355, | |
"learning_rate": 0.0006206055434590713, | |
"loss": 2.9762, | |
"step": 161000 | |
}, | |
{ | |
"epoch": 8.71, | |
"grad_norm": 0.2539065480232239, | |
"learning_rate": 0.0006176631554169364, | |
"loss": 2.9819, | |
"step": 162000 | |
}, | |
{ | |
"epoch": 8.77, | |
"grad_norm": 0.2278948873281479, | |
"learning_rate": 0.0006147207673748014, | |
"loss": 2.9809, | |
"step": 163000 | |
}, | |
{ | |
"epoch": 8.82, | |
"grad_norm": 0.23980122804641724, | |
"learning_rate": 0.0006117813217207086, | |
"loss": 2.9829, | |
"step": 164000 | |
}, | |
{ | |
"epoch": 8.87, | |
"grad_norm": 0.25505828857421875, | |
"learning_rate": 0.0006088389336785736, | |
"loss": 2.9845, | |
"step": 165000 | |
}, | |
{ | |
"epoch": 8.93, | |
"grad_norm": 0.26612409949302673, | |
"learning_rate": 0.0006058965456364386, | |
"loss": 2.9874, | |
"step": 166000 | |
}, | |
{ | |
"epoch": 8.98, | |
"grad_norm": 0.2189650535583496, | |
"learning_rate": 0.0006029541575943035, | |
"loss": 2.9832, | |
"step": 167000 | |
}, | |
{ | |
"epoch": 9.0, | |
"eval_accuracy": 0.40796309369994566, | |
"eval_loss": 3.3773229122161865, | |
"eval_runtime": 154.2001, | |
"eval_samples_per_second": 375.622, | |
"eval_steps_per_second": 5.875, | |
"step": 167337 | |
}, | |
{ | |
"epoch": 9.04, | |
"grad_norm": 0.27406546473503113, | |
"learning_rate": 0.0006000147119402107, | |
"loss": 2.9377, | |
"step": 168000 | |
}, | |
{ | |
"epoch": 9.09, | |
"grad_norm": 0.2886415719985962, | |
"learning_rate": 0.0005970723238980756, | |
"loss": 2.9216, | |
"step": 169000 | |
}, | |
{ | |
"epoch": 9.14, | |
"grad_norm": 0.30712640285491943, | |
"learning_rate": 0.0005941299358559407, | |
"loss": 2.9295, | |
"step": 170000 | |
}, | |
{ | |
"epoch": 9.2, | |
"grad_norm": 0.24219301342964172, | |
"learning_rate": 0.0005911875478138057, | |
"loss": 2.9333, | |
"step": 171000 | |
}, | |
{ | |
"epoch": 9.25, | |
"grad_norm": 0.24113836884498596, | |
"learning_rate": 0.0005882481021597129, | |
"loss": 2.9348, | |
"step": 172000 | |
}, | |
{ | |
"epoch": 9.3, | |
"grad_norm": 0.26300373673439026, | |
"learning_rate": 0.0005853057141175778, | |
"loss": 2.94, | |
"step": 173000 | |
}, | |
{ | |
"epoch": 9.36, | |
"grad_norm": 0.2541556656360626, | |
"learning_rate": 0.0005823662684634849, | |
"loss": 2.9424, | |
"step": 174000 | |
}, | |
{ | |
"epoch": 9.41, | |
"grad_norm": 0.2480422556400299, | |
"learning_rate": 0.00057942388042135, | |
"loss": 2.9424, | |
"step": 175000 | |
}, | |
{ | |
"epoch": 9.47, | |
"grad_norm": 0.22237752377986908, | |
"learning_rate": 0.0005764844347672571, | |
"loss": 2.9463, | |
"step": 176000 | |
}, | |
{ | |
"epoch": 9.52, | |
"grad_norm": 0.23950238525867462, | |
"learning_rate": 0.0005735420467251222, | |
"loss": 2.9531, | |
"step": 177000 | |
}, | |
{ | |
"epoch": 9.57, | |
"grad_norm": 0.2510799467563629, | |
"learning_rate": 0.0005705996586829871, | |
"loss": 2.952, | |
"step": 178000 | |
}, | |
{ | |
"epoch": 9.63, | |
"grad_norm": 0.23875580728054047, | |
"learning_rate": 0.0005676572706408521, | |
"loss": 2.955, | |
"step": 179000 | |
}, | |
{ | |
"epoch": 9.68, | |
"grad_norm": 0.2786506414413452, | |
"learning_rate": 0.0005647178249867593, | |
"loss": 2.9576, | |
"step": 180000 | |
}, | |
{ | |
"epoch": 9.73, | |
"grad_norm": 0.22881832718849182, | |
"learning_rate": 0.0005617754369446243, | |
"loss": 2.9578, | |
"step": 181000 | |
}, | |
{ | |
"epoch": 9.79, | |
"grad_norm": 0.24316935241222382, | |
"learning_rate": 0.0005588359912905315, | |
"loss": 2.9569, | |
"step": 182000 | |
}, | |
{ | |
"epoch": 9.84, | |
"grad_norm": 0.2705481946468353, | |
"learning_rate": 0.0005558936032483965, | |
"loss": 2.9565, | |
"step": 183000 | |
}, | |
{ | |
"epoch": 9.9, | |
"grad_norm": 0.2366609275341034, | |
"learning_rate": 0.0005529512152062614, | |
"loss": 2.9613, | |
"step": 184000 | |
}, | |
{ | |
"epoch": 9.95, | |
"grad_norm": 0.2798861265182495, | |
"learning_rate": 0.0005500088271641264, | |
"loss": 2.9565, | |
"step": 185000 | |
}, | |
{ | |
"epoch": 10.0, | |
"eval_accuracy": 0.40983898839268323, | |
"eval_loss": 3.351311683654785, | |
"eval_runtime": 153.7734, | |
"eval_samples_per_second": 376.665, | |
"eval_steps_per_second": 5.892, | |
"step": 185930 | |
}, | |
{ | |
"epoch": 10.0, | |
"grad_norm": 0.26953232288360596, | |
"learning_rate": 0.0005470693815100336, | |
"loss": 2.951, | |
"step": 186000 | |
}, | |
{ | |
"epoch": 10.06, | |
"grad_norm": 0.2257799357175827, | |
"learning_rate": 0.0005441269934678985, | |
"loss": 2.8908, | |
"step": 187000 | |
}, | |
{ | |
"epoch": 10.11, | |
"grad_norm": 0.23467403650283813, | |
"learning_rate": 0.0005411875478138058, | |
"loss": 2.9006, | |
"step": 188000 | |
}, | |
{ | |
"epoch": 10.17, | |
"grad_norm": 0.2749410569667816, | |
"learning_rate": 0.0005382451597716706, | |
"loss": 2.9069, | |
"step": 189000 | |
}, | |
{ | |
"epoch": 10.22, | |
"grad_norm": 0.2599303722381592, | |
"learning_rate": 0.0005353057141175779, | |
"loss": 2.9104, | |
"step": 190000 | |
}, | |
{ | |
"epoch": 10.27, | |
"grad_norm": 0.2843225598335266, | |
"learning_rate": 0.0005323633260754428, | |
"loss": 2.9137, | |
"step": 191000 | |
}, | |
{ | |
"epoch": 10.33, | |
"grad_norm": 0.24611413478851318, | |
"learning_rate": 0.00052942388042135, | |
"loss": 2.9161, | |
"step": 192000 | |
}, | |
{ | |
"epoch": 10.38, | |
"grad_norm": 0.2826565206050873, | |
"learning_rate": 0.000526481492379215, | |
"loss": 2.9175, | |
"step": 193000 | |
}, | |
{ | |
"epoch": 10.43, | |
"grad_norm": 0.28213128447532654, | |
"learning_rate": 0.000523542046725122, | |
"loss": 2.9164, | |
"step": 194000 | |
}, | |
{ | |
"epoch": 10.49, | |
"grad_norm": 0.2640922963619232, | |
"learning_rate": 0.0005205996586829871, | |
"loss": 2.9258, | |
"step": 195000 | |
}, | |
{ | |
"epoch": 10.54, | |
"grad_norm": 0.2534860670566559, | |
"learning_rate": 0.0005176572706408521, | |
"loss": 2.9278, | |
"step": 196000 | |
}, | |
{ | |
"epoch": 10.6, | |
"grad_norm": 0.24981430172920227, | |
"learning_rate": 0.0005147148825987171, | |
"loss": 2.9292, | |
"step": 197000 | |
}, | |
{ | |
"epoch": 10.65, | |
"grad_norm": 0.2640092372894287, | |
"learning_rate": 0.0005117724945565822, | |
"loss": 2.9365, | |
"step": 198000 | |
}, | |
{ | |
"epoch": 10.7, | |
"grad_norm": 0.27058008313179016, | |
"learning_rate": 0.0005088330489024892, | |
"loss": 2.9324, | |
"step": 199000 | |
}, | |
{ | |
"epoch": 10.76, | |
"grad_norm": 0.25028932094573975, | |
"learning_rate": 0.0005058906608603543, | |
"loss": 2.9324, | |
"step": 200000 | |
}, | |
{ | |
"epoch": 10.81, | |
"grad_norm": 0.2733946144580841, | |
"learning_rate": 0.0005029512152062614, | |
"loss": 2.9309, | |
"step": 201000 | |
}, | |
{ | |
"epoch": 10.86, | |
"grad_norm": 0.2877325117588043, | |
"learning_rate": 0.0005000088271641264, | |
"loss": 2.9353, | |
"step": 202000 | |
}, | |
{ | |
"epoch": 10.92, | |
"grad_norm": 0.2410973757505417, | |
"learning_rate": 0.0004970664391219914, | |
"loss": 2.9363, | |
"step": 203000 | |
}, | |
{ | |
"epoch": 10.97, | |
"grad_norm": 0.2605952024459839, | |
"learning_rate": 0.0004941240510798565, | |
"loss": 2.9381, | |
"step": 204000 | |
}, | |
{ | |
"epoch": 11.0, | |
"eval_accuracy": 0.40963281732793566, | |
"eval_loss": 3.3482396602630615, | |
"eval_runtime": 154.276, | |
"eval_samples_per_second": 375.437, | |
"eval_steps_per_second": 5.873, | |
"step": 204523 | |
}, | |
{ | |
"epoch": 11.03, | |
"grad_norm": 0.2494332194328308, | |
"learning_rate": 0.0004911846054257635, | |
"loss": 2.9059, | |
"step": 205000 | |
}, | |
{ | |
"epoch": 11.08, | |
"grad_norm": 0.27014607191085815, | |
"learning_rate": 0.0004882451597716707, | |
"loss": 2.8743, | |
"step": 206000 | |
}, | |
{ | |
"epoch": 11.13, | |
"grad_norm": 0.24216848611831665, | |
"learning_rate": 0.0004853027717295357, | |
"loss": 2.8821, | |
"step": 207000 | |
}, | |
{ | |
"epoch": 11.19, | |
"grad_norm": 0.2651168704032898, | |
"learning_rate": 0.0004823633260754429, | |
"loss": 2.8867, | |
"step": 208000 | |
}, | |
{ | |
"epoch": 11.24, | |
"grad_norm": 0.2583751082420349, | |
"learning_rate": 0.0004794209380333078, | |
"loss": 2.8907, | |
"step": 209000 | |
}, | |
{ | |
"epoch": 11.29, | |
"grad_norm": 0.26034820079803467, | |
"learning_rate": 0.00047647854999117284, | |
"loss": 2.8953, | |
"step": 210000 | |
}, | |
{ | |
"epoch": 11.35, | |
"grad_norm": 0.3137526214122772, | |
"learning_rate": 0.0004735361619490379, | |
"loss": 2.8943, | |
"step": 211000 | |
}, | |
{ | |
"epoch": 11.4, | |
"grad_norm": 0.29816457629203796, | |
"learning_rate": 0.000470596716294945, | |
"loss": 2.8985, | |
"step": 212000 | |
}, | |
{ | |
"epoch": 11.46, | |
"grad_norm": 0.2567712664604187, | |
"learning_rate": 0.00046765727064085213, | |
"loss": 2.9024, | |
"step": 213000 | |
}, | |
{ | |
"epoch": 11.51, | |
"grad_norm": 0.2891099154949188, | |
"learning_rate": 0.0004647148825987171, | |
"loss": 2.9014, | |
"step": 214000 | |
}, | |
{ | |
"epoch": 11.56, | |
"grad_norm": 0.26076436042785645, | |
"learning_rate": 0.00046177249455658215, | |
"loss": 2.9061, | |
"step": 215000 | |
}, | |
{ | |
"epoch": 11.62, | |
"grad_norm": 0.2649165689945221, | |
"learning_rate": 0.00045883010651444713, | |
"loss": 2.9061, | |
"step": 216000 | |
}, | |
{ | |
"epoch": 11.67, | |
"grad_norm": 0.30298641324043274, | |
"learning_rate": 0.0004558877184723121, | |
"loss": 2.9054, | |
"step": 217000 | |
}, | |
{ | |
"epoch": 11.72, | |
"grad_norm": 0.2831864058971405, | |
"learning_rate": 0.00045294827281821924, | |
"loss": 2.9075, | |
"step": 218000 | |
}, | |
{ | |
"epoch": 11.78, | |
"grad_norm": 0.2540026307106018, | |
"learning_rate": 0.00045000882716412637, | |
"loss": 2.9112, | |
"step": 219000 | |
}, | |
{ | |
"epoch": 11.83, | |
"grad_norm": 0.2612316906452179, | |
"learning_rate": 0.0004470664391219914, | |
"loss": 2.9124, | |
"step": 220000 | |
}, | |
{ | |
"epoch": 11.89, | |
"grad_norm": 0.26292499899864197, | |
"learning_rate": 0.00044412405107985644, | |
"loss": 2.9111, | |
"step": 221000 | |
}, | |
{ | |
"epoch": 11.94, | |
"grad_norm": 0.2696852684020996, | |
"learning_rate": 0.0004411816630377214, | |
"loss": 2.9151, | |
"step": 222000 | |
}, | |
{ | |
"epoch": 11.99, | |
"grad_norm": 0.2645638883113861, | |
"learning_rate": 0.00043824221738362855, | |
"loss": 2.9171, | |
"step": 223000 | |
}, | |
{ | |
"epoch": 12.0, | |
"eval_accuracy": 0.41176466241385945, | |
"eval_loss": 3.3354427814483643, | |
"eval_runtime": 153.951, | |
"eval_samples_per_second": 376.23, | |
"eval_steps_per_second": 5.885, | |
"step": 223116 | |
}, | |
{ | |
"epoch": 12.05, | |
"grad_norm": 0.289524108171463, | |
"learning_rate": 0.0004352998293414936, | |
"loss": 2.8568, | |
"step": 224000 | |
}, | |
{ | |
"epoch": 12.1, | |
"grad_norm": 0.28729820251464844, | |
"learning_rate": 0.00043235744129935857, | |
"loss": 2.858, | |
"step": 225000 | |
}, | |
{ | |
"epoch": 12.16, | |
"grad_norm": 0.2749260663986206, | |
"learning_rate": 0.00042941505325722355, | |
"loss": 2.8627, | |
"step": 226000 | |
}, | |
{ | |
"epoch": 12.21, | |
"grad_norm": 0.2578941583633423, | |
"learning_rate": 0.0004264785499911729, | |
"loss": 2.8695, | |
"step": 227000 | |
}, | |
{ | |
"epoch": 12.26, | |
"grad_norm": 0.2701773941516876, | |
"learning_rate": 0.0004235361619490378, | |
"loss": 2.8702, | |
"step": 228000 | |
}, | |
{ | |
"epoch": 12.32, | |
"grad_norm": 0.2578844428062439, | |
"learning_rate": 0.00042059377390690284, | |
"loss": 2.8706, | |
"step": 229000 | |
}, | |
{ | |
"epoch": 12.37, | |
"grad_norm": 0.2564055621623993, | |
"learning_rate": 0.0004176513858647679, | |
"loss": 2.8719, | |
"step": 230000 | |
}, | |
{ | |
"epoch": 12.42, | |
"grad_norm": 0.2670447528362274, | |
"learning_rate": 0.000414711940210675, | |
"loss": 2.8779, | |
"step": 231000 | |
}, | |
{ | |
"epoch": 12.48, | |
"grad_norm": 0.25996309518814087, | |
"learning_rate": 0.00041176955216854, | |
"loss": 2.8834, | |
"step": 232000 | |
}, | |
{ | |
"epoch": 12.53, | |
"grad_norm": 0.2569678723812103, | |
"learning_rate": 0.00040882716412640497, | |
"loss": 2.8813, | |
"step": 233000 | |
}, | |
{ | |
"epoch": 12.59, | |
"grad_norm": 0.25648847222328186, | |
"learning_rate": 0.00040588771847231215, | |
"loss": 2.8887, | |
"step": 234000 | |
}, | |
{ | |
"epoch": 12.64, | |
"grad_norm": 0.27697890996932983, | |
"learning_rate": 0.00040294533043017713, | |
"loss": 2.8892, | |
"step": 235000 | |
}, | |
{ | |
"epoch": 12.69, | |
"grad_norm": 0.2665017247200012, | |
"learning_rate": 0.0004000058847760843, | |
"loss": 2.8903, | |
"step": 236000 | |
}, | |
{ | |
"epoch": 12.75, | |
"grad_norm": 0.27752918004989624, | |
"learning_rate": 0.00039706349673394924, | |
"loss": 2.8866, | |
"step": 237000 | |
}, | |
{ | |
"epoch": 12.8, | |
"grad_norm": 0.28366342186927795, | |
"learning_rate": 0.0003941240510798564, | |
"loss": 2.8901, | |
"step": 238000 | |
}, | |
{ | |
"epoch": 12.85, | |
"grad_norm": 0.268888920545578, | |
"learning_rate": 0.0003911816630377214, | |
"loss": 2.8909, | |
"step": 239000 | |
}, | |
{ | |
"epoch": 12.91, | |
"grad_norm": 0.2524598240852356, | |
"learning_rate": 0.00038823927499558644, | |
"loss": 2.8904, | |
"step": 240000 | |
}, | |
{ | |
"epoch": 12.96, | |
"grad_norm": 0.2472527027130127, | |
"learning_rate": 0.00038529982934149356, | |
"loss": 2.893, | |
"step": 241000 | |
}, | |
{ | |
"epoch": 13.0, | |
"eval_accuracy": 0.4111454102552269, | |
"eval_loss": 3.3519082069396973, | |
"eval_runtime": 153.4319, | |
"eval_samples_per_second": 377.503, | |
"eval_steps_per_second": 5.905, | |
"step": 241709 | |
}, | |
{ | |
"epoch": 13.02, | |
"grad_norm": 0.26184141635894775, | |
"learning_rate": 0.00038235744129935855, | |
"loss": 2.8762, | |
"step": 242000 | |
}, | |
{ | |
"epoch": 13.07, | |
"grad_norm": 0.26451557874679565, | |
"learning_rate": 0.00037941799564526567, | |
"loss": 2.8369, | |
"step": 243000 | |
}, | |
{ | |
"epoch": 13.12, | |
"grad_norm": 0.26714783906936646, | |
"learning_rate": 0.0003764756076031307, | |
"loss": 2.8415, | |
"step": 244000 | |
}, | |
{ | |
"epoch": 13.18, | |
"grad_norm": 0.25869232416152954, | |
"learning_rate": 0.00037353321956099574, | |
"loss": 2.8465, | |
"step": 245000 | |
}, | |
{ | |
"epoch": 13.23, | |
"grad_norm": 0.30912360548973083, | |
"learning_rate": 0.00037059377390690287, | |
"loss": 2.8481, | |
"step": 246000 | |
}, | |
{ | |
"epoch": 13.28, | |
"grad_norm": 0.30928054451942444, | |
"learning_rate": 0.00036765138586476785, | |
"loss": 2.8496, | |
"step": 247000 | |
}, | |
{ | |
"epoch": 13.34, | |
"grad_norm": 0.32625919580459595, | |
"learning_rate": 0.00036470899782263283, | |
"loss": 2.8551, | |
"step": 248000 | |
}, | |
{ | |
"epoch": 13.39, | |
"grad_norm": 0.2953076958656311, | |
"learning_rate": 0.00036176955216854, | |
"loss": 2.8563, | |
"step": 249000 | |
}, | |
{ | |
"epoch": 13.45, | |
"grad_norm": 0.2896451950073242, | |
"learning_rate": 0.000358827164126405, | |
"loss": 2.8619, | |
"step": 250000 | |
}, | |
{ | |
"epoch": 13.5, | |
"grad_norm": 0.271090567111969, | |
"learning_rate": 0.0003558877184723122, | |
"loss": 2.8616, | |
"step": 251000 | |
}, | |
{ | |
"epoch": 13.55, | |
"grad_norm": 0.2772102952003479, | |
"learning_rate": 0.0003529453304301771, | |
"loss": 2.8644, | |
"step": 252000 | |
}, | |
{ | |
"epoch": 13.61, | |
"grad_norm": 0.25150129199028015, | |
"learning_rate": 0.00035000294238804214, | |
"loss": 2.8627, | |
"step": 253000 | |
}, | |
{ | |
"epoch": 13.66, | |
"grad_norm": 0.27617818117141724, | |
"learning_rate": 0.00034706349673394927, | |
"loss": 2.861, | |
"step": 254000 | |
}, | |
{ | |
"epoch": 13.71, | |
"grad_norm": 0.2900349795818329, | |
"learning_rate": 0.0003441211086918143, | |
"loss": 2.8657, | |
"step": 255000 | |
}, | |
{ | |
"epoch": 13.77, | |
"grad_norm": 0.2805399000644684, | |
"learning_rate": 0.0003411787206496793, | |
"loss": 2.871, | |
"step": 256000 | |
}, | |
{ | |
"epoch": 13.82, | |
"grad_norm": 0.2589282691478729, | |
"learning_rate": 0.00033823633260754427, | |
"loss": 2.8672, | |
"step": 257000 | |
}, | |
{ | |
"epoch": 13.88, | |
"grad_norm": 0.291664183139801, | |
"learning_rate": 0.00033529688695345145, | |
"loss": 2.8715, | |
"step": 258000 | |
}, | |
{ | |
"epoch": 13.93, | |
"grad_norm": 0.2693411707878113, | |
"learning_rate": 0.00033235449891131643, | |
"loss": 2.8763, | |
"step": 259000 | |
}, | |
{ | |
"epoch": 13.98, | |
"grad_norm": 0.27113044261932373, | |
"learning_rate": 0.0003294150532572236, | |
"loss": 2.876, | |
"step": 260000 | |
}, | |
{ | |
"epoch": 14.0, | |
"eval_accuracy": 0.41093769408311887, | |
"eval_loss": 3.3631937503814697, | |
"eval_runtime": 153.9063, | |
"eval_samples_per_second": 376.339, | |
"eval_steps_per_second": 5.887, | |
"step": 260302 | |
}, | |
{ | |
"epoch": 14.04, | |
"grad_norm": 0.2805856168270111, | |
"learning_rate": 0.00032647266521508854, | |
"loss": 2.8345, | |
"step": 261000 | |
}, | |
{ | |
"epoch": 14.09, | |
"grad_norm": 0.2932230234146118, | |
"learning_rate": 0.00032353321956099567, | |
"loss": 2.8245, | |
"step": 262000 | |
}, | |
{ | |
"epoch": 14.15, | |
"grad_norm": 0.2711317837238312, | |
"learning_rate": 0.0003205908315188607, | |
"loss": 2.8265, | |
"step": 263000 | |
}, | |
{ | |
"epoch": 14.2, | |
"grad_norm": 0.30451709032058716, | |
"learning_rate": 0.00031765138586476783, | |
"loss": 2.8294, | |
"step": 264000 | |
}, | |
{ | |
"epoch": 14.25, | |
"grad_norm": 0.2669803500175476, | |
"learning_rate": 0.00031470899782263287, | |
"loss": 2.8349, | |
"step": 265000 | |
}, | |
{ | |
"epoch": 14.31, | |
"grad_norm": 0.3362666070461273, | |
"learning_rate": 0.00031176660978049785, | |
"loss": 2.8385, | |
"step": 266000 | |
}, | |
{ | |
"epoch": 14.36, | |
"grad_norm": 0.2826295495033264, | |
"learning_rate": 0.000308827164126405, | |
"loss": 2.8351, | |
"step": 267000 | |
}, | |
{ | |
"epoch": 14.41, | |
"grad_norm": 0.29114797711372375, | |
"learning_rate": 0.00030588477608427, | |
"loss": 2.8372, | |
"step": 268000 | |
}, | |
{ | |
"epoch": 14.47, | |
"grad_norm": 0.29871678352355957, | |
"learning_rate": 0.00030294533043017714, | |
"loss": 2.8397, | |
"step": 269000 | |
}, | |
{ | |
"epoch": 14.52, | |
"grad_norm": 0.2672843933105469, | |
"learning_rate": 0.0003000029423880422, | |
"loss": 2.8448, | |
"step": 270000 | |
}, | |
{ | |
"epoch": 14.58, | |
"grad_norm": 0.25932183861732483, | |
"learning_rate": 0.0002970605543459071, | |
"loss": 2.8443, | |
"step": 271000 | |
}, | |
{ | |
"epoch": 14.63, | |
"grad_norm": 0.2911001443862915, | |
"learning_rate": 0.00029411816630377214, | |
"loss": 2.8493, | |
"step": 272000 | |
}, | |
{ | |
"epoch": 14.68, | |
"grad_norm": 0.2664991617202759, | |
"learning_rate": 0.00029117577826163717, | |
"loss": 2.8422, | |
"step": 273000 | |
}, | |
{ | |
"epoch": 14.74, | |
"grad_norm": 0.30084219574928284, | |
"learning_rate": 0.0002882363326075443, | |
"loss": 2.8505, | |
"step": 274000 | |
}, | |
{ | |
"epoch": 14.79, | |
"grad_norm": 0.2992892861366272, | |
"learning_rate": 0.0002852939445654093, | |
"loss": 2.8497, | |
"step": 275000 | |
}, | |
{ | |
"epoch": 14.84, | |
"grad_norm": 0.335448682308197, | |
"learning_rate": 0.0002823544989113164, | |
"loss": 2.8529, | |
"step": 276000 | |
}, | |
{ | |
"epoch": 14.9, | |
"grad_norm": 0.28675752878189087, | |
"learning_rate": 0.00027941211086918144, | |
"loss": 2.8497, | |
"step": 277000 | |
}, | |
{ | |
"epoch": 14.95, | |
"grad_norm": 0.3506409525871277, | |
"learning_rate": 0.00027647266521508857, | |
"loss": 2.8472, | |
"step": 278000 | |
}, | |
{ | |
"epoch": 15.0, | |
"eval_accuracy": 0.41203740745226775, | |
"eval_loss": 3.3530044555664062, | |
"eval_runtime": 154.3507, | |
"eval_samples_per_second": 375.256, | |
"eval_steps_per_second": 5.87, | |
"step": 278895 | |
}, | |
{ | |
"epoch": 15.01, | |
"grad_norm": 0.2708049416542053, | |
"learning_rate": 0.0002735302771729536, | |
"loss": 2.8479, | |
"step": 279000 | |
}, | |
{ | |
"epoch": 15.06, | |
"grad_norm": 0.2830442190170288, | |
"learning_rate": 0.00027059083151886073, | |
"loss": 2.8018, | |
"step": 280000 | |
}, | |
{ | |
"epoch": 15.11, | |
"grad_norm": 0.2702392041683197, | |
"learning_rate": 0.0002676484434767257, | |
"loss": 2.805, | |
"step": 281000 | |
}, | |
{ | |
"epoch": 15.17, | |
"grad_norm": 0.3452757000923157, | |
"learning_rate": 0.00026470899782263284, | |
"loss": 2.8103, | |
"step": 282000 | |
}, | |
{ | |
"epoch": 15.22, | |
"grad_norm": 0.28475674986839294, | |
"learning_rate": 0.0002617666097804979, | |
"loss": 2.8141, | |
"step": 283000 | |
}, | |
{ | |
"epoch": 15.27, | |
"grad_norm": 0.2740528881549835, | |
"learning_rate": 0.00025882422173836286, | |
"loss": 2.8162, | |
"step": 284000 | |
}, | |
{ | |
"epoch": 15.33, | |
"grad_norm": 0.3067547380924225, | |
"learning_rate": 0.00025588183369622784, | |
"loss": 2.8182, | |
"step": 285000 | |
}, | |
{ | |
"epoch": 15.38, | |
"grad_norm": 0.3173828721046448, | |
"learning_rate": 0.00025294238804213497, | |
"loss": 2.8179, | |
"step": 286000 | |
}, | |
{ | |
"epoch": 15.44, | |
"grad_norm": 0.32790786027908325, | |
"learning_rate": 0.00025, | |
"loss": 2.8222, | |
"step": 287000 | |
}, | |
{ | |
"epoch": 15.49, | |
"grad_norm": 0.3280278444290161, | |
"learning_rate": 0.00024706055434590713, | |
"loss": 2.8263, | |
"step": 288000 | |
}, | |
{ | |
"epoch": 15.54, | |
"grad_norm": 0.30356574058532715, | |
"learning_rate": 0.00024411816630377214, | |
"loss": 2.826, | |
"step": 289000 | |
}, | |
{ | |
"epoch": 15.6, | |
"grad_norm": 0.2901437282562256, | |
"learning_rate": 0.00024117577826163715, | |
"loss": 2.8254, | |
"step": 290000 | |
}, | |
{ | |
"epoch": 15.65, | |
"grad_norm": 0.34759074449539185, | |
"learning_rate": 0.0002382363326075443, | |
"loss": 2.8291, | |
"step": 291000 | |
}, | |
{ | |
"epoch": 15.7, | |
"grad_norm": 0.38021716475486755, | |
"learning_rate": 0.00023529394456540929, | |
"loss": 2.8286, | |
"step": 292000 | |
}, | |
{ | |
"epoch": 15.76, | |
"grad_norm": 0.29485049843788147, | |
"learning_rate": 0.0002323515565232743, | |
"loss": 2.8312, | |
"step": 293000 | |
}, | |
{ | |
"epoch": 15.81, | |
"grad_norm": 0.27850472927093506, | |
"learning_rate": 0.00022941211086918142, | |
"loss": 2.8346, | |
"step": 294000 | |
}, | |
{ | |
"epoch": 15.87, | |
"grad_norm": 0.2848072052001953, | |
"learning_rate": 0.00022646972282704646, | |
"loss": 2.8291, | |
"step": 295000 | |
}, | |
{ | |
"epoch": 15.92, | |
"grad_norm": 0.3154236674308777, | |
"learning_rate": 0.00022353027717295358, | |
"loss": 2.8338, | |
"step": 296000 | |
}, | |
{ | |
"epoch": 15.97, | |
"grad_norm": 0.2986476719379425, | |
"learning_rate": 0.00022058788913081857, | |
"loss": 2.8335, | |
"step": 297000 | |
}, | |
{ | |
"epoch": 16.0, | |
"eval_accuracy": 0.41131288645738256, | |
"eval_loss": 3.3726773262023926, | |
"eval_runtime": 153.5545, | |
"eval_samples_per_second": 377.201, | |
"eval_steps_per_second": 5.9, | |
"step": 297488 | |
}, | |
{ | |
"epoch": 16.03, | |
"grad_norm": 0.31234875321388245, | |
"learning_rate": 0.00021764844347672572, | |
"loss": 2.8076, | |
"step": 298000 | |
}, | |
{ | |
"epoch": 16.08, | |
"grad_norm": 0.30626383423805237, | |
"learning_rate": 0.0002147060554345907, | |
"loss": 2.7883, | |
"step": 299000 | |
}, | |
{ | |
"epoch": 16.14, | |
"grad_norm": 0.2825140655040741, | |
"learning_rate": 0.00021176366739245574, | |
"loss": 2.7947, | |
"step": 300000 | |
}, | |
{ | |
"epoch": 16.19, | |
"grad_norm": 0.28852617740631104, | |
"learning_rate": 0.00020882422173836286, | |
"loss": 2.795, | |
"step": 301000 | |
}, | |
{ | |
"epoch": 16.24, | |
"grad_norm": 0.31543779373168945, | |
"learning_rate": 0.00020588477608427, | |
"loss": 2.798, | |
"step": 302000 | |
}, | |
{ | |
"epoch": 16.3, | |
"grad_norm": 0.2947680652141571, | |
"learning_rate": 0.000202942388042135, | |
"loss": 2.8025, | |
"step": 303000 | |
}, | |
{ | |
"epoch": 16.35, | |
"grad_norm": 0.291638046503067, | |
"learning_rate": 0.0002, | |
"loss": 2.8015, | |
"step": 304000 | |
}, | |
{ | |
"epoch": 16.4, | |
"grad_norm": 0.29190531373023987, | |
"learning_rate": 0.00019705761195786502, | |
"loss": 2.8063, | |
"step": 305000 | |
}, | |
{ | |
"epoch": 16.46, | |
"grad_norm": 0.3504582643508911, | |
"learning_rate": 0.00019411522391573, | |
"loss": 2.8084, | |
"step": 306000 | |
}, | |
{ | |
"epoch": 16.51, | |
"grad_norm": 0.2807118892669678, | |
"learning_rate": 0.000191172835873595, | |
"loss": 2.8028, | |
"step": 307000 | |
}, | |
{ | |
"epoch": 16.57, | |
"grad_norm": 0.29211023449897766, | |
"learning_rate": 0.00018823339021950214, | |
"loss": 2.8073, | |
"step": 308000 | |
}, | |
{ | |
"epoch": 16.62, | |
"grad_norm": 0.28356799483299255, | |
"learning_rate": 0.0001852939445654093, | |
"loss": 2.8114, | |
"step": 309000 | |
}, | |
{ | |
"epoch": 16.67, | |
"grad_norm": 0.32647237181663513, | |
"learning_rate": 0.0001823515565232743, | |
"loss": 2.809, | |
"step": 310000 | |
}, | |
{ | |
"epoch": 16.73, | |
"grad_norm": 0.3036649227142334, | |
"learning_rate": 0.0001794091684811393, | |
"loss": 2.8088, | |
"step": 311000 | |
}, | |
{ | |
"epoch": 16.78, | |
"grad_norm": 0.3010052740573883, | |
"learning_rate": 0.0001764667804390043, | |
"loss": 2.8145, | |
"step": 312000 | |
}, | |
{ | |
"epoch": 16.83, | |
"grad_norm": 0.2830561101436615, | |
"learning_rate": 0.00017352733478491144, | |
"loss": 2.8093, | |
"step": 313000 | |
}, | |
{ | |
"epoch": 16.89, | |
"grad_norm": 0.3039667308330536, | |
"learning_rate": 0.00017058494674277645, | |
"loss": 2.8143, | |
"step": 314000 | |
}, | |
{ | |
"epoch": 16.94, | |
"grad_norm": 0.3185649514198303, | |
"learning_rate": 0.00016764255870064143, | |
"loss": 2.8144, | |
"step": 315000 | |
}, | |
{ | |
"epoch": 17.0, | |
"grad_norm": 0.2941153049468994, | |
"learning_rate": 0.00016470017065850644, | |
"loss": 2.812, | |
"step": 316000 | |
}, | |
{ | |
"epoch": 17.0, | |
"eval_accuracy": 0.41104343316943776, | |
"eval_loss": 3.3814456462860107, | |
"eval_runtime": 153.7991, | |
"eval_samples_per_second": 376.602, | |
"eval_steps_per_second": 5.891, | |
"step": 316081 | |
}, | |
{ | |
"epoch": 17.05, | |
"grad_norm": 0.30029386281967163, | |
"learning_rate": 0.00016176366739245572, | |
"loss": 2.7825, | |
"step": 317000 | |
}, | |
{ | |
"epoch": 17.1, | |
"grad_norm": 0.2988363206386566, | |
"learning_rate": 0.00015882127935032072, | |
"loss": 2.7782, | |
"step": 318000 | |
}, | |
{ | |
"epoch": 17.16, | |
"grad_norm": 0.31658461689949036, | |
"learning_rate": 0.00015588183369622785, | |
"loss": 2.7813, | |
"step": 319000 | |
}, | |
{ | |
"epoch": 17.21, | |
"grad_norm": 0.3059552311897278, | |
"learning_rate": 0.0001529394456540929, | |
"loss": 2.7835, | |
"step": 320000 | |
}, | |
{ | |
"epoch": 17.26, | |
"grad_norm": 0.30100756883621216, | |
"learning_rate": 0.00014999705761195787, | |
"loss": 2.7827, | |
"step": 321000 | |
}, | |
{ | |
"epoch": 17.32, | |
"grad_norm": 0.30339285731315613, | |
"learning_rate": 0.00014705466956982285, | |
"loss": 2.7887, | |
"step": 322000 | |
}, | |
{ | |
"epoch": 17.37, | |
"grad_norm": 0.3006446957588196, | |
"learning_rate": 0.00014411522391573, | |
"loss": 2.7872, | |
"step": 323000 | |
}, | |
{ | |
"epoch": 17.43, | |
"grad_norm": 0.28081661462783813, | |
"learning_rate": 0.000141172835873595, | |
"loss": 2.7869, | |
"step": 324000 | |
}, | |
{ | |
"epoch": 17.48, | |
"grad_norm": 0.28526371717453003, | |
"learning_rate": 0.00013823339021950214, | |
"loss": 2.7881, | |
"step": 325000 | |
}, | |
{ | |
"epoch": 17.53, | |
"grad_norm": 0.31043946743011475, | |
"learning_rate": 0.00013529100217736715, | |
"loss": 2.7894, | |
"step": 326000 | |
}, | |
{ | |
"epoch": 17.59, | |
"grad_norm": 0.3015703558921814, | |
"learning_rate": 0.00013234861413523216, | |
"loss": 2.79, | |
"step": 327000 | |
}, | |
{ | |
"epoch": 17.64, | |
"grad_norm": 0.29077813029289246, | |
"learning_rate": 0.00012940916848113928, | |
"loss": 2.7921, | |
"step": 328000 | |
}, | |
{ | |
"epoch": 17.69, | |
"grad_norm": 0.29008862376213074, | |
"learning_rate": 0.00012646972282704644, | |
"loss": 2.7921, | |
"step": 329000 | |
}, | |
{ | |
"epoch": 17.75, | |
"grad_norm": 0.29531145095825195, | |
"learning_rate": 0.00012352733478491142, | |
"loss": 2.7908, | |
"step": 330000 | |
}, | |
{ | |
"epoch": 17.8, | |
"grad_norm": 0.3173753619194031, | |
"learning_rate": 0.00012058494674277644, | |
"loss": 2.7954, | |
"step": 331000 | |
}, | |
{ | |
"epoch": 17.86, | |
"grad_norm": 0.28667834401130676, | |
"learning_rate": 0.00011764550108868357, | |
"loss": 2.791, | |
"step": 332000 | |
}, | |
{ | |
"epoch": 17.91, | |
"grad_norm": 0.3025910556316376, | |
"learning_rate": 0.00011470311304654858, | |
"loss": 2.7956, | |
"step": 333000 | |
}, | |
{ | |
"epoch": 17.96, | |
"grad_norm": 0.32427656650543213, | |
"learning_rate": 0.00011176366739245572, | |
"loss": 2.7944, | |
"step": 334000 | |
}, | |
{ | |
"epoch": 18.0, | |
"eval_accuracy": 0.4119177623997049, | |
"eval_loss": 3.3788328170776367, | |
"eval_runtime": 153.909, | |
"eval_samples_per_second": 376.333, | |
"eval_steps_per_second": 5.887, | |
"step": 334674 | |
}, | |
{ | |
"epoch": 18.02, | |
"grad_norm": 0.28937357664108276, | |
"learning_rate": 0.00010882127935032073, | |
"loss": 2.785, | |
"step": 335000 | |
}, | |
{ | |
"epoch": 18.07, | |
"grad_norm": 0.3173358738422394, | |
"learning_rate": 0.00010588183369622787, | |
"loss": 2.7681, | |
"step": 336000 | |
}, | |
{ | |
"epoch": 18.13, | |
"grad_norm": 0.3219173550605774, | |
"learning_rate": 0.00010293944565409286, | |
"loss": 2.7649, | |
"step": 337000 | |
}, | |
{ | |
"epoch": 18.18, | |
"grad_norm": 0.30825838446617126, | |
"learning_rate": 0.0001, | |
"loss": 2.7678, | |
"step": 338000 | |
}, | |
{ | |
"epoch": 18.23, | |
"grad_norm": 0.2984776198863983, | |
"learning_rate": 9.7057611957865e-05, | |
"loss": 2.7691, | |
"step": 339000 | |
}, | |
{ | |
"epoch": 18.29, | |
"grad_norm": 0.3033863604068756, | |
"learning_rate": 9.411522391573001e-05, | |
"loss": 2.7718, | |
"step": 340000 | |
}, | |
{ | |
"epoch": 18.34, | |
"grad_norm": 0.298531711101532, | |
"learning_rate": 9.117283587359502e-05, | |
"loss": 2.7688, | |
"step": 341000 | |
}, | |
{ | |
"epoch": 18.39, | |
"grad_norm": 0.37657690048217773, | |
"learning_rate": 8.823339021950214e-05, | |
"loss": 2.7737, | |
"step": 342000 | |
}, | |
{ | |
"epoch": 18.45, | |
"grad_norm": 0.30152538418769836, | |
"learning_rate": 8.529394456540929e-05, | |
"loss": 2.7705, | |
"step": 343000 | |
}, | |
{ | |
"epoch": 18.5, | |
"grad_norm": 0.2887534499168396, | |
"learning_rate": 8.23515565232743e-05, | |
"loss": 2.7716, | |
"step": 344000 | |
}, | |
{ | |
"epoch": 18.56, | |
"grad_norm": 0.31988152861595154, | |
"learning_rate": 7.94091684811393e-05, | |
"loss": 2.7738, | |
"step": 345000 | |
}, | |
{ | |
"epoch": 18.61, | |
"grad_norm": 0.2952527105808258, | |
"learning_rate": 7.646972282704644e-05, | |
"loss": 2.7732, | |
"step": 346000 | |
}, | |
{ | |
"epoch": 18.66, | |
"grad_norm": 0.318419873714447, | |
"learning_rate": 7.352733478491143e-05, | |
"loss": 2.7787, | |
"step": 347000 | |
}, | |
{ | |
"epoch": 18.72, | |
"grad_norm": 0.290344774723053, | |
"learning_rate": 7.058494674277643e-05, | |
"loss": 2.7724, | |
"step": 348000 | |
}, | |
{ | |
"epoch": 18.77, | |
"grad_norm": 0.34806349873542786, | |
"learning_rate": 6.764550108868357e-05, | |
"loss": 2.7785, | |
"step": 349000 | |
}, | |
{ | |
"epoch": 18.82, | |
"grad_norm": 0.27911612391471863, | |
"learning_rate": 6.470311304654858e-05, | |
"loss": 2.7754, | |
"step": 350000 | |
}, | |
{ | |
"epoch": 18.88, | |
"grad_norm": 0.36661508679389954, | |
"learning_rate": 6.176072500441358e-05, | |
"loss": 2.7737, | |
"step": 351000 | |
}, | |
{ | |
"epoch": 18.93, | |
"grad_norm": 0.2794508635997772, | |
"learning_rate": 5.881833696227858e-05, | |
"loss": 2.7796, | |
"step": 352000 | |
}, | |
{ | |
"epoch": 18.99, | |
"grad_norm": 0.3130718171596527, | |
"learning_rate": 5.587889130818573e-05, | |
"loss": 2.7761, | |
"step": 353000 | |
}, | |
{ | |
"epoch": 19.0, | |
"eval_accuracy": 0.4113871859678455, | |
"eval_loss": 3.3925299644470215, | |
"eval_runtime": 155.9595, | |
"eval_samples_per_second": 371.385, | |
"eval_steps_per_second": 5.809, | |
"step": 353267 | |
}, | |
{ | |
"epoch": 19.04, | |
"grad_norm": 0.2997041940689087, | |
"learning_rate": 5.293650326605073e-05, | |
"loss": 2.7625, | |
"step": 354000 | |
}, | |
{ | |
"epoch": 19.09, | |
"grad_norm": 0.31620392203330994, | |
"learning_rate": 4.999411522391573e-05, | |
"loss": 2.7545, | |
"step": 355000 | |
}, | |
{ | |
"epoch": 19.15, | |
"grad_norm": 0.3096230626106262, | |
"learning_rate": 4.7051727181780734e-05, | |
"loss": 2.7559, | |
"step": 356000 | |
}, | |
{ | |
"epoch": 19.2, | |
"grad_norm": 0.29621490836143494, | |
"learning_rate": 4.4115223915730006e-05, | |
"loss": 2.7603, | |
"step": 357000 | |
}, | |
{ | |
"epoch": 19.25, | |
"grad_norm": 0.3432357609272003, | |
"learning_rate": 4.117283587359501e-05, | |
"loss": 2.7593, | |
"step": 358000 | |
}, | |
{ | |
"epoch": 19.31, | |
"grad_norm": 0.32921677827835083, | |
"learning_rate": 3.823044783146001e-05, | |
"loss": 2.7558, | |
"step": 359000 | |
}, | |
{ | |
"epoch": 19.36, | |
"grad_norm": 0.3113718032836914, | |
"learning_rate": 3.528805978932502e-05, | |
"loss": 2.7584, | |
"step": 360000 | |
}, | |
{ | |
"epoch": 19.42, | |
"grad_norm": 0.29590970277786255, | |
"learning_rate": 3.234861413523216e-05, | |
"loss": 2.7579, | |
"step": 361000 | |
}, | |
{ | |
"epoch": 19.47, | |
"grad_norm": 0.3046259582042694, | |
"learning_rate": 2.940622609309716e-05, | |
"loss": 2.757, | |
"step": 362000 | |
}, | |
{ | |
"epoch": 19.52, | |
"grad_norm": 0.31513723731040955, | |
"learning_rate": 2.64667804390043e-05, | |
"loss": 2.7601, | |
"step": 363000 | |
}, | |
{ | |
"epoch": 19.58, | |
"grad_norm": 0.31137752532958984, | |
"learning_rate": 2.35243923968693e-05, | |
"loss": 2.7618, | |
"step": 364000 | |
}, | |
{ | |
"epoch": 19.63, | |
"grad_norm": 0.3485892117023468, | |
"learning_rate": 2.0582004354734303e-05, | |
"loss": 2.7603, | |
"step": 365000 | |
}, | |
{ | |
"epoch": 19.68, | |
"grad_norm": 0.3177970051765442, | |
"learning_rate": 1.7639616312599305e-05, | |
"loss": 2.7573, | |
"step": 366000 | |
}, | |
{ | |
"epoch": 19.74, | |
"grad_norm": 0.3040596842765808, | |
"learning_rate": 1.4700170658506444e-05, | |
"loss": 2.7569, | |
"step": 367000 | |
}, | |
{ | |
"epoch": 19.79, | |
"grad_norm": 0.31309446692466736, | |
"learning_rate": 1.1757782616371448e-05, | |
"loss": 2.7592, | |
"step": 368000 | |
}, | |
{ | |
"epoch": 19.85, | |
"grad_norm": 0.3122311234474182, | |
"learning_rate": 8.815394574236451e-06, | |
"loss": 2.761, | |
"step": 369000 | |
}, | |
{ | |
"epoch": 19.9, | |
"grad_norm": 0.30037108063697815, | |
"learning_rate": 5.8759489201435885e-06, | |
"loss": 2.7553, | |
"step": 370000 | |
}, | |
{ | |
"epoch": 19.95, | |
"grad_norm": 0.33223336935043335, | |
"learning_rate": 2.933560878008592e-06, | |
"loss": 2.7606, | |
"step": 371000 | |
}, | |
{
"epoch": 20.0,
"eval_accuracy": 0.41125847180686265,
"eval_loss": 3.397514581680298,
"eval_runtime": 153.9424,
"eval_samples_per_second": 376.251,
"eval_steps_per_second": 5.885,
"step": 371860
},
{
"epoch": 20.0,
"step": 371860,
"total_flos": 1.56674405385216e+18,
"train_loss": 3.027371880730273,
"train_runtime": 81125.0441,
"train_samples_per_second": 146.679,
"train_steps_per_second": 4.584
}
],
"logging_steps": 1000,
"max_steps": 371860,
"num_input_tokens_seen": 0,
"num_train_epochs": 20,
"save_steps": 5000,
"total_flos": 1.56674405385216e+18,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}