|
|
|
0%| | 0/1090 [00:00<?, ?it/s][WARNING|logging.py:314] 2024-02-01 17:38:34,491 >> You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding. |
|
{'loss': 13.9185, 'learning_rate': 1.8348623853211012e-07, 'epoch': 0.0} |
|
0%|β | 3/1090 [00:09<43:27, 2.40s/it] |
|
|
|
|
|
|
|
1%|β | 9/1090 [00:14<18:27, 1.02s/it] |
|
|
|
|
|
1%|β | 14/1090 [00:19<16:33, 1.08it/s] |
|
|
|
|
|
2%|ββ | 18/1090 [00:22<16:14, 1.10it/s] |
|
|
|
|
|
2%|ββ | 23/1090 [00:27<16:05, 1.11it/s] |
|
|
|
|
|
|
|
3%|βββ | 29/1090 [00:32<16:02, 1.10it/s] |
|
|
|
|
|
3%|βββ | 34/1090 [00:37<15:58, 1.10it/s] |
|
|
|
|
|
3%|βββ | 38/1090 [00:41<15:53, 1.10it/s] |
|
|
|
|
|
4%|βββ | 42/1090 [00:44<15:50, 1.10it/s] |
|
|
|
|
|
|
|
4%|ββββ | 49/1090 [00:51<15:46, 1.10it/s] |
|
|
|
|
|
5%|ββββ | 54/1090 [00:55<15:42, 1.10it/s] |
|
|
|
|
|
5%|βββββ | 58/1090 [00:59<15:38, 1.10it/s] |
|
|
|
|
|
|
|
6%|βββββ | 64/1090 [01:04<15:34, 1.10it/s] |
|
|
|
|
|
6%|ββββββ | 69/1090 [01:09<15:30, 1.10it/s] |
|
|
|
|
|
7%|ββββββ | 73/1090 [01:12<15:26, 1.10it/s] |
|
|
|
|
|
|
|
7%|ββββββ | 80/1090 [01:19<15:18, 1.10it/s] |
|
|
|
|
|
8%|βββββββ | 84/1090 [01:22<15:16, 1.10it/s] |
|
|
|
|
|
8%|βββββββ | 89/1090 [01:27<15:09, 1.10it/s] |
|
|
|
|
|
9%|βββββββ | 93/1090 [01:31<15:08, 1.10it/s] |
|
|
|
|
|
|
|
9%|ββββββββ | 100/1090 [01:37<14:59, 1.10it/s] |
|
9%|ββββββββ | 100/1090 [01:37<14:59, 1.10it/s][INFO|trainer.py:2889] 2024-02-01 17:40:13,014 >> Saving model checkpoint to ./tmp-checkpoint-100 |
|
[INFO|configuration_utils.py:483] 2024-02-01 17:40:13,018 >> Configuration saved in ./tmp-checkpoint-100/config.json |
|
[INFO|configuration_utils.py:594] 2024-02-01 17:40:13,020 >> Configuration saved in ./tmp-checkpoint-100/generation_config.json |
|
[INFO|modeling_utils.py:2382] 2024-02-01 17:40:16,055 >> Model weights saved in ./tmp-checkpoint-100/pytorch_model.bin |
|
[INFO|tokenization_utils_base.py:2432] 2024-02-01 17:40:16,059 >> tokenizer config file saved in ./tmp-checkpoint-100/tokenizer_config.json |
|
[INFO|tokenization_utils_base.py:2441] 2024-02-01 17:40:16,061 >> Special tokens file saved in ./tmp-checkpoint-100/special_tokens_map.json |
|
/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. |
|
warnings.warn( |
|
[2024-02-01 17:40:16,087] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step100 is about to be saved! |
|
[2024-02-01 17:40:16,093] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: ./tmp-checkpoint-100/global_step100/zero_pp_rank_0_mp_rank_00_model_states.pt |
|
[2024-02-01 17:40:16,093] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ./tmp-checkpoint-100/global_step100/zero_pp_rank_0_mp_rank_00_model_states.pt... |
|
[2024-02-01 17:40:16,210] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ./tmp-checkpoint-100/global_step100/zero_pp_rank_0_mp_rank_00_model_states.pt. |
|
[2024-02-01 17:40:16,214] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ./tmp-checkpoint-100/global_step100/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... |
|
[2024-02-01 17:40:19,957] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ./tmp-checkpoint-100/global_step100/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. |
|
[2024-02-01 17:40:19,962] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved ./tmp-checkpoint-100/global_step100/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt |
|
[2024-02-01 17:40:20,277] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step100 is ready now! |
|
[INFO|tokenization_utils_base.py:2432] 2024-02-01 17:40:22,999 >> tokenizer config file saved in ./tokenizer_config.json |
|
[INFO|tokenization_utils_base.py:2441] 2024-02-01 17:40:23,001 >> Special tokens file saved in ./special_tokens_map.json |
|
|
|
9%|ββββββββ | 103/1090 [01:51<41:42, 2.54s/it] |
|
|
|
|
|
|
|
10%|ββββββββ | 110/1090 [01:57<17:02, 1.04s/it] |
|
|
|
|
|
10%|βββββββββ | 114/1090 [02:01<15:18, 1.06it/s] |
|
|
|
|
|
11%|βββββββββ | 118/1090 [02:04<14:52, 1.09it/s] |
|
|
|
|
|
|
|
11%|ββββββββββ | 125/1090 [02:11<14:39, 1.10it/s] |
|
|
|
|
|
12%|ββββββββββ | 129/1090 [02:14<14:34, 1.10it/s] |
|
|
|
|
|
12%|ββββββββββ | 134/1090 [02:19<14:31, 1.10it/s] |
|
|
|
|
|
13%|βββββββββββ | 138/1090 [02:23<14:26, 1.10it/s] |
|
|
|
|
|
|
|
13%|βββββββββββ | 145/1090 [02:29<14:19, 1.10it/s] |
|
|
|
|
|
14%|βββββββββββ | 149/1090 [02:33<14:17, 1.10it/s] |
|
|
|
|
|
14%|ββββββββββββ | 153/1090 [02:36<14:14, 1.10it/s] |
|
|
|
|
|
|
|
15%|ββββββββββββ | 160/1090 [02:43<14:08, 1.10it/s] |
|
|
|
|
|
15%|ββββββββββββ | 164/1090 [02:46<14:07, 1.09it/s] |
|
|
|
|
|
16%|βββββββββββββ | 169/1090 [02:51<14:01, 1.09it/s] |
|
|
|
|
|
16%|βββββββββββββ | 173/1090 [02:55<13:57, 1.10it/s] |
|
|
|
|
|
|
|
17%|ββββββββββββββ | 180/1090 [03:01<13:51, 1.09it/s] |
|
|
|
|
|
17%|ββββββββββββββ | 184/1090 [03:05<13:47, 1.09it/s] |
|
|
|
|
|
17%|ββββββββββββββ | 188/1090 [03:08<13:44, 1.09it/s] |
|
|
|
|
|
18%|βββββββββββββββ | 193/1090 [03:13<13:39, 1.09it/s] |
|
|
|
|
|
|
|
18%|βββββββββββββββ | 199/1090 [03:18<13:34, 1.09it/s] |
|
18%|βββββββββββββββ | 200/1090 [03:19<13:32, 1.10it/s][INFO|trainer.py:2889] 2024-02-01 17:41:55,274 >> Saving model checkpoint to ./tmp-checkpoint-200 |
|
[INFO|configuration_utils.py:483] 2024-02-01 17:41:55,278 >> Configuration saved in ./tmp-checkpoint-200/config.json |
|
[INFO|configuration_utils.py:594] 2024-02-01 17:41:55,280 >> Configuration saved in ./tmp-checkpoint-200/generation_config.json |
|
[2024-02-01 17:41:58,337] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step200 is about to be saved! |
|
[2024-02-01 17:41:58,341] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: ./tmp-checkpoint-200/global_step200/zero_pp_rank_0_mp_rank_00_model_states.pt |
|
[2024-02-01 17:41:58,341] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ./tmp-checkpoint-200/global_step200/zero_pp_rank_0_mp_rank_00_model_states.pt... |
|
[2024-02-01 17:41:58,452] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ./tmp-checkpoint-200/global_step200/zero_pp_rank_0_mp_rank_00_model_states.pt. |
|
[2024-02-01 17:41:58,456] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ./tmp-checkpoint-200/global_step200/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... |
|
[INFO|modeling_utils.py:2382] 2024-02-01 17:41:58,307 >> Model weights saved in ./tmp-checkpoint-200/pytorch_model.bin |
|
[INFO|tokenization_utils_base.py:2432] 2024-02-01 17:41:58,310 >> tokenizer config file saved in ./tmp-checkpoint-200/tokenizer_config.json |
|
[INFO|tokenization_utils_base.py:2441] 2024-02-01 17:41:58,312 >> Special tokens file saved in ./tmp-checkpoint-200/special_tokens_map.json |
|
/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. |
|
warnings.warn( |
|
[2024-02-01 17:42:02,208] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ./tmp-checkpoint-200/global_step200/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. |
|
[2024-02-01 17:42:02,213] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved ./tmp-checkpoint-200/global_step200/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt |
|
[2024-02-01 17:42:02,552] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step200 is ready now! |
|
[INFO|tokenization_utils_base.py:2432] 2024-02-01 17:42:05,523 >> tokenizer config file saved in ./tokenizer_config.json |
|
[INFO|tokenization_utils_base.py:2441] 2024-02-01 17:42:05,525 >> Special tokens file saved in ./special_tokens_map.json |
|
[INFO|trainer.py:2979] 2024-02-01 17:42:05,553 >> Deleting older checkpoint [checkpoint-100] due to args.save_total_limit |
|
|
|
19%|βββββββββββββββ | 203/1090 [03:37<45:45, 3.10s/it] |
|
|
|
|
|
|
|
19%|ββββββββββββββββ | 209/1090 [03:42<17:11, 1.17s/it] |
|
|
|
|
|
20%|ββββββββββββββββ | 214/1090 [03:47<14:00, 1.04it/s] |
|
|
|
|
|
20%|ββββββββββββββββ | 218/1090 [03:51<13:24, 1.08it/s] |
|
|
|
|
|
|
|
21%|βββββββββββββββββ | 225/1090 [03:57<13:10, 1.09it/s] |
|
|
|
|
|
21%|βββββββββββββββββ | 229/1090 [04:01<13:08, 1.09it/s] |
|
|
|
|
|
21%|ββββββββββββββββββ | 234/1090 [04:05<13:03, 1.09it/s] |
|
|
|
|
|
22%|ββββββββββββββββββ | 238/1090 [04:09<12:57, 1.10it/s] |
|
|
|
|
|
|
|
22%|ββββββββββββββββββ | 244/1090 [04:14<12:54, 1.09it/s] |
|
|
|
|
|
23%|βββββββββββββββββββ | 249/1090 [04:19<12:51, 1.09it/s] |
|
|
|
|
|
23%|βββββββββββββββββββ | 253/1090 [04:23<12:45, 1.09it/s] |
|
|
|
|
|
|
|
24%|βββββββββββββββββββ | 260/1090 [04:29<12:38, 1.09it/s] |
|
|
|
|
|
24%|ββββββββββββββββββββ | 264/1090 [04:33<12:37, 1.09it/s] |
|
|
|
|
|
25%|ββββββββββββββββββββ | 269/1090 [04:37<12:32, 1.09it/s] |
|
|
|
|
|
25%|ββββββββββββββββββββ | 273/1090 [04:41<12:28, 1.09it/s] |
|
|
|
|
|
|
|
26%|βββββββββββββββββββββ | 279/1090 [04:46<12:22, 1.09it/s] |
|
|
|
|
|
26%|βββββββββββββββββββββ | 284/1090 [04:51<12:15, 1.10it/s] |
|
|
|
|
|
26%|ββββββββββββββββββββββ | 288/1090 [04:55<12:13, 1.09it/s] |
|
|
|
|
|
|
|
27%|ββββββββββββββββββββββ | 295/1090 [05:01<12:08, 1.09it/s] |
|
|
|
|
|
27%|ββββββββββββββββββββββ | 299/1090 [05:05<12:03, 1.09it/s] |
|
28%|ββββββββββββββββββββββ | 300/1090 [05:06<12:03, 1.09it/s][INFO|trainer.py:2889] 2024-02-01 17:43:41,617 >> Saving model checkpoint to ./tmp-checkpoint-300 |
|
[INFO|configuration_utils.py:483] 2024-02-01 17:43:41,621 >> Configuration saved in ./tmp-checkpoint-300/config.json |
|
[INFO|configuration_utils.py:594] 2024-02-01 17:43:41,623 >> Configuration saved in ./tmp-checkpoint-300/generation_config.json |
|
[2024-02-01 17:43:44,742] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step300 is about to be saved! |
|
[2024-02-01 17:43:44,746] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: ./tmp-checkpoint-300/global_step300/zero_pp_rank_0_mp_rank_00_model_states.pt |
|
[2024-02-01 17:43:44,747] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ./tmp-checkpoint-300/global_step300/zero_pp_rank_0_mp_rank_00_model_states.pt... |
|
[2024-02-01 17:43:44,751] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ./tmp-checkpoint-300/global_step300/zero_pp_rank_0_mp_rank_00_model_states.pt. |
|
[2024-02-01 17:43:44,759] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ./tmp-checkpoint-300/global_step300/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... |
|
[INFO|modeling_utils.py:2382] 2024-02-01 17:43:44,712 >> Model weights saved in ./tmp-checkpoint-300/pytorch_model.bin |
|
[INFO|tokenization_utils_base.py:2432] 2024-02-01 17:43:44,715 >> tokenizer config file saved in ./tmp-checkpoint-300/tokenizer_config.json |
|
[INFO|tokenization_utils_base.py:2441] 2024-02-01 17:43:44,717 >> Special tokens file saved in ./tmp-checkpoint-300/special_tokens_map.json |
|
/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. |
|
warnings.warn( |
|
[2024-02-01 17:43:48,812] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ./tmp-checkpoint-300/global_step300/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. |
|
[2024-02-01 17:43:48,816] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved ./tmp-checkpoint-300/global_step300/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt |
|
[2024-02-01 17:43:48,817] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step300 is ready now! |
|
[INFO|tokenization_utils_base.py:2432] 2024-02-01 17:43:51,631 >> tokenizer config file saved in ./tokenizer_config.json |
|
[INFO|tokenization_utils_base.py:2441] 2024-02-01 17:43:51,633 >> Special tokens file saved in ./special_tokens_map.json |
|
[INFO|trainer.py:2979] 2024-02-01 17:43:51,660 >> Deleting older checkpoint [checkpoint-200] due to args.save_total_limit |
|
|
|
|
|
28%|βββββββββββββββββββββββ | 305/1090 [05:25<25:42, 1.96s/it] |
|
|
|
|
|
28%|βββββββββββββββββββββββ | 309/1090 [05:28<15:09, 1.16s/it] |
|
|
|
|
|
29%|βββββββββββββββββββββββ | 314/1090 [05:33<12:20, 1.05it/s] |
|
|
|
|
|
29%|ββββββββββββββββββββββββ | 318/1090 [05:37<11:52, 1.08it/s] |
|
|
|
|
|
|
|
30%|ββββββββββββββββββββββββ | 325/1090 [05:43<11:38, 1.09it/s] |
|
|
|
|
|
30%|βββββββββββββββββββββββββ | 329/1090 [05:47<11:34, 1.10it/s] |
|
|
|
|
|
31%|βββββββββββββββββββββββββ | 334/1090 [05:51<11:31, 1.09it/s] |
|
|
|
|
|
31%|βββββββββββββββββββββββββ | 338/1090 [05:55<11:26, 1.09it/s] |
|
|
|
|
|
|
|
32%|ββββββββββββββββββββββββββ | 344/1090 [06:00<11:22, 1.09it/s] |
|
|
|
|
|
32%|ββββββββββββββββββββββββββ | 349/1090 [06:05<11:17, 1.09it/s] |
|
|
|
|
|
32%|ββββββββββββββββββββββββββ | 353/1090 [06:09<11:15, 1.09it/s] |
|
|
|
|
|
|
|
33%|βββββββββββββββββββββββββββ | 360/1090 [06:15<11:07, 1.09it/s] |
|
|
|
|
|
33%|βββββββββββββββββββββββββββ | 364/1090 [06:19<11:02, 1.10it/s] |
|
|
|
|
|
34%|βββββββββββββββββββββββββββ | 369/1090 [06:23<10:58, 1.10it/s] |
|
|
|
|
|
34%|ββββββββββββββββββββββββββββ | 373/1090 [06:27<10:54, 1.10it/s] |
|
|
|
|
|
|
|
35%|ββββββββββββββββββββββββββββ | 380/1090 [06:33<10:48, 1.09it/s] |
|
|
|
|
|
35%|βββββββββββββββββββββββββββββ | 384/1090 [06:37<10:45, 1.09it/s] |
|
|
|
|
|
36%|βββββββββββββββββββββββββββββ | 388/1090 [06:41<10:41, 1.09it/s] |
|
|
|
|
|
|
|
36%|βββββββββββββββββββββββββββββ | 395/1090 [06:47<10:35, 1.09it/s] |
|
|
|
|
|
37%|ββββββββββββββββββββββββββββββ | 399/1090 [06:51<11:32, 1.00s/it] |
|
37%|ββββββββββββββββββββββββββββββ | 400/1090 [06:52<11:13, 1.02it/s][INFO|trainer.py:2889] 2024-02-01 17:45:27,895 >> Saving model checkpoint to ./tmp-checkpoint-400 |
|
[INFO|configuration_utils.py:483] 2024-02-01 17:45:27,898 >> Configuration saved in ./tmp-checkpoint-400/config.json |
|
[INFO|configuration_utils.py:594] 2024-02-01 17:45:27,900 >> Configuration saved in ./tmp-checkpoint-400/generation_config.json |
|
[2024-02-01 17:45:31,010] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step400 is about to be saved! |
|
[2024-02-01 17:45:31,014] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: ./tmp-checkpoint-400/global_step400/zero_pp_rank_0_mp_rank_00_model_states.pt |
|
[2024-02-01 17:45:31,014] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ./tmp-checkpoint-400/global_step400/zero_pp_rank_0_mp_rank_00_model_states.pt... |
|
[2024-02-01 17:45:31,018] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ./tmp-checkpoint-400/global_step400/zero_pp_rank_0_mp_rank_00_model_states.pt. |
|
[2024-02-01 17:45:31,026] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ./tmp-checkpoint-400/global_step400/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... |
|
[INFO|modeling_utils.py:2382] 2024-02-01 17:45:30,979 >> Model weights saved in ./tmp-checkpoint-400/pytorch_model.bin |
|
[INFO|tokenization_utils_base.py:2432] 2024-02-01 17:45:30,982 >> tokenizer config file saved in ./tmp-checkpoint-400/tokenizer_config.json |
|
[INFO|tokenization_utils_base.py:2441] 2024-02-01 17:45:30,984 >> Special tokens file saved in ./tmp-checkpoint-400/special_tokens_map.json |
|
/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. |
|
warnings.warn( |
|
[2024-02-01 17:45:35,048] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ./tmp-checkpoint-400/global_step400/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. |
|
[2024-02-01 17:45:35,053] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved ./tmp-checkpoint-400/global_step400/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt |
|
[2024-02-01 17:45:35,055] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step400 is ready now! |
|
[INFO|tokenization_utils_base.py:2432] 2024-02-01 17:45:37,892 >> tokenizer config file saved in ./tokenizer_config.json |
|
[INFO|tokenization_utils_base.py:2441] 2024-02-01 17:45:37,895 >> Special tokens file saved in ./special_tokens_map.json |
|
[INFO|trainer.py:2979] 2024-02-01 17:45:37,923 >> Deleting older checkpoint [checkpoint-300] due to args.save_total_limit |
|
|
|
37%|ββββββββββββββββββββββββββββββ | 405/1090 [07:11<22:47, 2.00s/it] |
|
|
|
|
|
38%|ββββββββββββββββββββββββββββββ | 409/1090 [07:15<13:18, 1.17s/it] |
|
|
|
|
|
38%|βββββββββββββββββββββββββββββββ | 413/1090 [07:19<11:00, 1.02it/s] |
|
|
|
|
|
|
|
39%|βββββββββββββββββββββββββββββββ | 420/1090 [07:25<10:15, 1.09it/s] |
|
|
|
|
|
39%|βββββββββββββββββββββββββββββββ | 424/1090 [07:29<10:09, 1.09it/s] |
|
|
|
|
|
39%|ββββββββββββββββββββββββββββββββ | 429/1090 [07:33<10:05, 1.09it/s] |
|
|
|
|
|
40%|ββββββββββββββββββββββββββββββββ | 433/1090 [07:37<10:01, 1.09it/s] |
|
|
|
|
|
|
|
40%|βββββββββββββββββββββββββββββββββ | 440/1090 [07:43<09:54, 1.09it/s] |
|
|
|
|
|
41%|βββββββββββββββββββββββββββββββββ | 444/1090 [07:47<09:50, 1.09it/s] |
|
|
|
|
|
41%|βββββββββββββββββββββββββββββββββ | 448/1090 [07:51<09:46, 1.09it/s] |
|
|
|
|
|
|
|
42%|ββββββββββββββββββββββββββββββββββ | 455/1090 [07:57<09:39, 1.10it/s] |
|
|
|
|
|
42%|ββββββββββββββββββββββββββββββββββ | 459/1090 [08:01<09:36, 1.09it/s] |
|
|
|
|
|
43%|ββββββββββββββββββββββββββββββββββ | 464/1090 [08:05<09:31, 1.09it/s] |
|
|
|
|
|
43%|βββββββββββββββββββββββββββββββββββ | 468/1090 [08:09<09:29, 1.09it/s] |
|
|
|
|
|
|
|
43%|βββββββββββββββββββββββββββββββββββ | 474/1090 [08:14<09:22, 1.09it/s] |
|
|
|
|
|
44%|ββββββββββββββββββββββββββββββββββββ | 479/1090 [08:19<09:18, 1.09it/s] |
|
|
|
|
|
44%|ββββββββββββββββββββββββββββββββββββ | 483/1090 [08:23<09:14, 1.09it/s] |
|
|
|
|
|
|
|
45%|ββββββββββββββββββββββββββββββββββββ | 490/1090 [08:29<09:09, 1.09it/s] |
|
|
|
|
|
45%|βββββββββββββββββββββββββββββββββββββ | 494/1090 [08:33<09:05, 1.09it/s] |
|
|
|
|
|
46%|βββββββββββββββββββββββββββββββββββββ | 499/1090 [08:37<09:00, 1.09it/s] |
|
46%|βββββββββββββββββββββββββββββββββββββ | 500/1090 [08:38<08:59, 1.09it/s][INFO|trainer.py:2889] 2024-02-01 17:47:14,118 >> Saving model checkpoint to ./tmp-checkpoint-500 |
|
[INFO|configuration_utils.py:483] 2024-02-01 17:47:14,121 >> Configuration saved in ./tmp-checkpoint-500/config.json |
|
[INFO|configuration_utils.py:594] 2024-02-01 17:47:14,124 >> Configuration saved in ./tmp-checkpoint-500/generation_config.json |
|
[2024-02-01 17:47:17,242] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step500 is about to be saved! |
|
[2024-02-01 17:47:17,247] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: ./tmp-checkpoint-500/global_step500/zero_pp_rank_0_mp_rank_00_model_states.pt |
|
[2024-02-01 17:47:17,247] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ./tmp-checkpoint-500/global_step500/zero_pp_rank_0_mp_rank_00_model_states.pt... |
|
[2024-02-01 17:47:17,368] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ./tmp-checkpoint-500/global_step500/zero_pp_rank_0_mp_rank_00_model_states.pt. |
|
[2024-02-01 17:47:17,373] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ./tmp-checkpoint-500/global_step500/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... |
|
[INFO|modeling_utils.py:2382] 2024-02-01 17:47:17,211 >> Model weights saved in ./tmp-checkpoint-500/pytorch_model.bin |
|
[INFO|tokenization_utils_base.py:2432] 2024-02-01 17:47:17,214 >> tokenizer config file saved in ./tmp-checkpoint-500/tokenizer_config.json |
|
[INFO|tokenization_utils_base.py:2441] 2024-02-01 17:47:17,216 >> Special tokens file saved in ./tmp-checkpoint-500/special_tokens_map.json |
|
/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. |
|
warnings.warn( |
|
[2024-02-01 17:47:20,828] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ./tmp-checkpoint-500/global_step500/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. |
|
[2024-02-01 17:47:20,833] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved ./tmp-checkpoint-500/global_step500/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt |
|
[2024-02-01 17:47:21,476] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step500 is ready now! |
|
[INFO|tokenization_utils_base.py:2432] 2024-02-01 17:47:24,306 >> tokenizer config file saved in ./tokenizer_config.json |
|
[INFO|tokenization_utils_base.py:2441] 2024-02-01 17:47:24,308 >> Special tokens file saved in ./special_tokens_map.json |
|
[INFO|trainer.py:2979] 2024-02-01 17:47:24,337 >> Deleting older checkpoint [checkpoint-400] due to args.save_total_limit |
|
|
|
46%|βββββββββββββββββββββββββββββββββββββ | 504/1090 [08:56<23:43, 2.43s/it] |
|
|
|
|
|
47%|ββββββββββββββββββββββββββββββββββββββ | 509/1090 [09:01<11:17, 1.17s/it] |
|
|
|
|
|
47%|ββββββββββββββββββββββββββββββββββββββ | 513/1090 [09:05<09:20, 1.03it/s] |
|
|
|
|
|
|
|
48%|βββββββββββββββββββββββββββββββββββββββ | 520/1090 [09:11<08:43, 1.09it/s] |
|
|
|
|
|
48%|βββββββββββββββββββββββββββββββββββββββ | 524/1090 [09:15<08:38, 1.09it/s] |
|
|
|
|
|
49%|βββββββββββββββββββββββββββββββββββββββ | 529/1090 [09:19<08:35, 1.09it/s] |
|
|
|
|
|
|
|
49%|ββββββββββββββββββββββββββββββββββββββββ | 535/1090 [09:25<08:29, 1.09it/s] |
|
|
|
|
|
49%|ββββββββββββββββββββββββββββββββββββββββ | 539/1090 [09:28<08:24, 1.09it/s] |
|
|
|
|
|
50%|ββββββββββββββββββββββββββββββββββββββββ | 544/1090 [09:33<08:19, 1.09it/s] |
|
|
|
|
|
50%|βββββββββββββββββββββββββββββββββββββββββ | 548/1090 [09:37<08:15, 1.09it/s] |
|
|
|
|
|
|
|
51%|βββββββββββββββββββββββββββββββββββββββββ | 555/1090 [09:43<08:09, 1.09it/s] |
|
|
|
|
|
51%|βββββββββββββββββββββββββββββββββββββββββ | 559/1090 [09:47<08:10, 1.08it/s] |
|
|
|
|
|
52%|ββββββββββββββββββββββββββββββββββββββββββ | 564/1090 [09:51<08:01, 1.09it/s] |
|
|
|
|
|
|
|
52%|ββββββββββββββββββββββββββββββββββββββββββ | 569/1090 [09:56<07:57, 1.09it/s] |
|
|
|
|
|
|
|
53%|βββββββββββββββββββββββββββββββββββββββββββ | 575/1090 [10:03<08:41, 1.01s/it] |
|
|
|
|
|
53%|βββββββββββββββββββββββββββββββββββββββββββ | 579/1090 [10:07<08:16, 1.03it/s] |
|
|
|
|
|
53%|βββββββββββββββββββββββββββββββββββββββββββ | 583/1090 [10:11<08:05, 1.04it/s] |
|
|
|
|
|
|
|
54%|ββββββββββββββββββββββββββββββββββββββββββββ | 590/1090 [10:17<07:52, 1.06it/s] |
|
|
|
|
|
54%|ββββββββββββββββββββββββββββββββββββββββββββ | 594/1090 [10:21<07:50, 1.06it/s] |
|
|
|
|
|
55%|ββββββββββββββββββββββββββββββββββββββββββββ | 598/1090 [10:25<07:43, 1.06it/s] |
|
|
|
55%|ββββββββββββββββββββββββββββββββββββββββββββ | 600/1090 [10:27<07:42, 1.06it/s][INFO|trainer.py:2889] 2024-02-01 17:49:02,702 >> Saving model checkpoint to ./tmp-checkpoint-600 |
|
[INFO|configuration_utils.py:483] 2024-02-01 17:49:02,706 >> Configuration saved in ./tmp-checkpoint-600/config.json |
|
[INFO|configuration_utils.py:594] 2024-02-01 17:49:02,708 >> Configuration saved in ./tmp-checkpoint-600/generation_config.json |
|
[INFO|modeling_utils.py:2382] 2024-02-01 17:49:05,978 >> Model weights saved in ./tmp-checkpoint-600/pytorch_model.bin |
|
[INFO|tokenization_utils_base.py:2432] 2024-02-01 17:49:05,993 >> tokenizer config file saved in ./tmp-checkpoint-600/tokenizer_config.json |
|
[INFO|tokenization_utils_base.py:2441] 2024-02-01 17:49:05,996 >> Special tokens file saved in ./tmp-checkpoint-600/special_tokens_map.json |
|
/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. |
|
warnings.warn( |
|
[2024-02-01 17:49:06,062] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step600 is about to be saved! |
|
[2024-02-01 17:49:06,071] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: ./tmp-checkpoint-600/global_step600/zero_pp_rank_0_mp_rank_00_model_states.pt |
|
[2024-02-01 17:49:06,071] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ./tmp-checkpoint-600/global_step600/zero_pp_rank_0_mp_rank_00_model_states.pt... |
|
[2024-02-01 17:49:06,208] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ./tmp-checkpoint-600/global_step600/zero_pp_rank_0_mp_rank_00_model_states.pt. |
|
[2024-02-01 17:49:06,213] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ./tmp-checkpoint-600/global_step600/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... |
|
[2024-02-01 17:49:10,329] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ./tmp-checkpoint-600/global_step600/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. |
|
[2024-02-01 17:49:10,334] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved ./tmp-checkpoint-600/global_step600/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt |
|
[2024-02-01 17:49:10,335] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step600 is ready now! |
|
[INFO|tokenization_utils_base.py:2432] 2024-02-01 17:49:13,152 >> tokenizer config file saved in ./tokenizer_config.json |
|
[INFO|tokenization_utils_base.py:2441] 2024-02-01 17:49:13,155 >> Special tokens file saved in ./special_tokens_map.json |
|
[INFO|trainer.py:2979] 2024-02-01 17:49:13,184 >> Deleting older checkpoint [checkpoint-500] due to args.save_total_limit |
|
|
|
55%|βββββββββββββββββββββββββββββββββββββββββββββ | 603/1090 [10:45<25:52, 3.19s/it] |
|
|
|
|
|
|
|
56%|βββββββββββββββββββββββββββββββββββββββββββββ | 610/1090 [10:51<08:57, 1.12s/it] |
|
|
|
|
|
56%|βββββββββββββββββββββββββββββββββββββββββββββ | 614/1090 [10:55<07:45, 1.02it/s] |
|
|
|
|
|
57%|ββββββββββββββββββββββββββββββββββββββββββββββ | 618/1090 [10:59<07:25, 1.06it/s] |
|
|
|
|
|
|
|
57%|ββββββββββββββββββββββββββββββββββββββββββββββ | 625/1090 [11:05<07:14, 1.07it/s] |
|
|
|
|
|
58%|βββββββββββββββββββββββββββββββββββββββββββββββ | 629/1090 [11:09<07:09, 1.07it/s] |
|
|
|
|
|
58%|βββββββββββββββββββββββββββββββββββββββββββββββ | 633/1090 [11:13<07:05, 1.07it/s] |
|
|
|
|
|
|
|
59%|βββββββββββββββββββββββββββββββββββββββββββββββ | 640/1090 [11:19<06:59, 1.07it/s] |
|
|
|
|
|
59%|ββββββββββββββββββββββββββββββββββββββββββββββββ | 644/1090 [11:23<06:55, 1.07it/s] |
|
|
|
|
|
59%|ββββββββββββββββββββββββββββββββββββββββββββββββ | 648/1090 [11:27<06:52, 1.07it/s] |
|
|
|
|
|
|
|
60%|ββββββββββββββββββββββββββββββββββββββββββββββββ | 655/1090 [11:33<06:45, 1.07it/s] |
|
|
|
|
|
60%|βββββββββββββββββββββββββββββββββββββββββββββββββ | 659/1090 [11:37<06:41, 1.07it/s] |
|
|
|
|
|
61%|βββββββββββββββββββββββββββββββββββββββββββββββββ | 663/1090 [11:41<06:38, 1.07it/s] |
|
|
|
|
|
|
|
61%|ββββββββββββββββββββββββββββββββββββββββββββββββββ | 670/1090 [11:47<06:30, 1.08it/s] |
|
|
|
|
|
62%|ββββββββββββββββββββββββββββββββββββββββββββββββββ | 674/1090 [11:51<06:25, 1.08it/s] |
|
|
|
|
|
62%|ββββββββββββββββββββββββββββββββββββββββββββββββββ | 678/1090 [11:55<06:22, 1.08it/s] |
|
|
|
|
|
|
|
63%|βββββββββββββββββββββββββββββββββββββββββββββββββββ | 684/1090 [12:01<06:36, 1.02it/s] |
|
|
|
|
|
63%|βββββββββββββββββββββββββββββββββββββββββββββββββββ | 689/1090 [12:05<06:16, 1.07it/s] |
|
|
|
|
|
|
|
64%|βββββββββββββββββββββββββββββββββββββββββββββββββββ | 695/1090 [12:11<06:07, 1.08it/s] |
|
|
|
|
|
64%|ββββββββββββββββββββββββββββββββββββββββββββββββββββ | 699/1090 [12:15<06:03, 1.08it/s] |
|
64%|ββββββββββββββββββββββββββββββββββββββββββββββββββββ | 700/1090 [12:16<06:02, 1.08it/s][INFO|trainer.py:2889] 2024-02-01 17:50:51,588 >> Saving model checkpoint to ./tmp-checkpoint-700 |
|
[INFO|configuration_utils.py:483] 2024-02-01 17:50:51,591 >> Configuration saved in ./tmp-checkpoint-700/config.json |
|
[INFO|configuration_utils.py:594] 2024-02-01 17:50:51,593 >> Configuration saved in ./tmp-checkpoint-700/generation_config.json |
|
[2024-02-01 17:50:54,656] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step700 is about to be saved! |
|
[2024-02-01 17:50:54,660] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: ./tmp-checkpoint-700/global_step700/zero_pp_rank_0_mp_rank_00_model_states.pt |
|
[2024-02-01 17:50:54,660] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ./tmp-checkpoint-700/global_step700/zero_pp_rank_0_mp_rank_00_model_states.pt... |
|
[2024-02-01 17:50:54,665] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ./tmp-checkpoint-700/global_step700/zero_pp_rank_0_mp_rank_00_model_states.pt. |
|
[2024-02-01 17:50:54,672] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ./tmp-checkpoint-700/global_step700/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... |
|
[INFO|modeling_utils.py:2382] 2024-02-01 17:50:54,621 >> Model weights saved in ./tmp-checkpoint-700/pytorch_model.bin |
|
[INFO|tokenization_utils_base.py:2432] 2024-02-01 17:50:54,624 >> tokenizer config file saved in ./tmp-checkpoint-700/tokenizer_config.json |
|
[INFO|tokenization_utils_base.py:2441] 2024-02-01 17:50:54,626 >> Special tokens file saved in ./tmp-checkpoint-700/special_tokens_map.json |
|
/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. |
|
warnings.warn( |
|
[2024-02-01 17:50:58,287] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ./tmp-checkpoint-700/global_step700/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. |
|
[2024-02-01 17:50:58,292] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved ./tmp-checkpoint-700/global_step700/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt |
|
[2024-02-01 17:50:58,823] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step700 is ready now! |
|
[INFO|tokenization_utils_base.py:2432] 2024-02-01 17:51:01,627 >> tokenizer config file saved in ./tokenizer_config.json |
|
[INFO|tokenization_utils_base.py:2441] 2024-02-01 17:51:01,629 >> Special tokens file saved in ./special_tokens_map.json |
|
[INFO|trainer.py:2979] 2024-02-01 17:51:01,659 >> Deleting older checkpoint [checkpoint-600] due to args.save_total_limit |
|
|
|
|
|
65%|ββββββββββββββββββββββββββββββββββββββββββββββββββββ | 705/1090 [12:35<12:45, 1.99s/it] |
|
|
|
|
|
65%|ββββββββββββββββββββββββββββββββββββββββββββββββββββ | 709/1090 [12:39<07:30, 1.18s/it] |
|
|
|
|
|
66%|βββββββββββββββββββββββββββββββββββββββββββββββββββββ | 714/1090 [12:43<06:04, 1.03it/s] |
|
|
|
|
|
|
|
66%|βββββββββββββββββββββββββββββββββββββββββββββββββββββ | 720/1090 [12:49<05:45, 1.07it/s] |
|
|
|
|
|
67%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 725/1090 [12:53<05:39, 1.08it/s] |
|
|
|
|
|
67%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 729/1090 [12:57<05:35, 1.08it/s] |
|
|
|
|
|
67%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 733/1090 [13:01<05:30, 1.08it/s] |
|
|
|
|
|
|
|
68%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 740/1090 [13:07<05:24, 1.08it/s] |
|
|
|
|
|
68%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 744/1090 [13:11<05:21, 1.07it/s] |
|
|
|
|
|
|
|
69%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 750/1090 [13:17<05:34, 1.02it/s] |
|
|
|
|
|
69%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 754/1090 [13:21<05:17, 1.06it/s] |
|
|
|
|
|
70%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 759/1090 [13:25<05:08, 1.07it/s] |
|
|
|
|
|
|
|
70%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 765/1090 [13:31<05:02, 1.08it/s] |
|
|
|
|
|
71%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 769/1090 [13:35<04:58, 1.08it/s] |
|
|
|
|
|
71%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 774/1090 [13:39<04:53, 1.08it/s] |
|
|
|
|
|
|
|
72%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 780/1090 [13:45<04:47, 1.08it/s] |
|
|
|
|
|
72%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 785/1090 [13:50<04:43, 1.08it/s] |
|
|
|
|
|
72%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 789/1090 [13:53<04:38, 1.08it/s] |
|
|
|
|
|
73%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 793/1090 [13:57<04:34, 1.08it/s] |
|
|
|
|
|
|
|
73%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 800/1090 [14:03<04:28, 1.08it/s] |
|
73%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 800/1090 [14:03<04:28, 1.08it/s][INFO|trainer.py:2889] 2024-02-01 17:52:39,448 >> Saving model checkpoint to ./tmp-checkpoint-800 |
|
[INFO|configuration_utils.py:483] 2024-02-01 17:52:39,452 >> Configuration saved in ./tmp-checkpoint-800/config.json |
|
[INFO|configuration_utils.py:594] 2024-02-01 17:52:39,454 >> Configuration saved in ./tmp-checkpoint-800/generation_config.json |
|
[2024-02-01 17:52:42,689] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step800 is about to be saved! |
|
[2024-02-01 17:52:42,693] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: ./tmp-checkpoint-800/global_step800/zero_pp_rank_0_mp_rank_00_model_states.pt |
|
[2024-02-01 17:52:42,693] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ./tmp-checkpoint-800/global_step800/zero_pp_rank_0_mp_rank_00_model_states.pt... |
|
[2024-02-01 17:52:42,880] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ./tmp-checkpoint-800/global_step800/zero_pp_rank_0_mp_rank_00_model_states.pt. |
|
[2024-02-01 17:52:42,884] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ./tmp-checkpoint-800/global_step800/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... |
|
[INFO|modeling_utils.py:2382] 2024-02-01 17:52:42,659 >> Model weights saved in ./tmp-checkpoint-800/pytorch_model.bin |
|
[INFO|tokenization_utils_base.py:2432] 2024-02-01 17:52:42,662 >> tokenizer config file saved in ./tmp-checkpoint-800/tokenizer_config.json |
|
[INFO|tokenization_utils_base.py:2441] 2024-02-01 17:52:42,664 >> Special tokens file saved in ./tmp-checkpoint-800/special_tokens_map.json |
|
/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. |
|
warnings.warn( |
|
[2024-02-01 17:52:46,824] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ./tmp-checkpoint-800/global_step800/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. |
|
[2024-02-01 17:52:46,829] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved ./tmp-checkpoint-800/global_step800/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt |
|
[2024-02-01 17:52:47,053] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step800 is ready now! |
|
[INFO|tokenization_utils_base.py:2432] 2024-02-01 17:52:50,048 >> tokenizer config file saved in ./tokenizer_config.json |
|
[INFO|tokenization_utils_base.py:2441] 2024-02-01 17:52:50,049 >> Special tokens file saved in ./special_tokens_map.json |
|
[INFO|trainer.py:2979] 2024-02-01 17:52:50,078 >> Deleting older checkpoint [checkpoint-700] due to args.save_total_limit |
|
|
|
|
|
74%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 805/1090 [14:23<09:36, 2.02s/it] |
|
|
|
|
|
74%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 809/1090 [14:27<05:35, 1.19s/it] |
|
|
|
|
|
75%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 813/1090 [14:31<04:34, 1.01it/s] |
|
|
|
|
|
|
|
75%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 820/1090 [14:37<04:12, 1.07it/s] |
|
|
|
|
|
76%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 824/1090 [14:41<04:08, 1.07it/s] |
|
|
|
|
|
76%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 829/1090 [14:46<04:01, 1.08it/s] |
|
|
|
|
|
|
|
77%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 835/1090 [14:51<03:56, 1.08it/s] |
|
|
|
|
|
77%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 839/1090 [14:55<03:52, 1.08it/s] |
|
|
|
|
|
77%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 844/1090 [14:59<03:49, 1.07it/s] |
|
|
|
|
|
|
|
78%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 850/1090 [15:05<03:42, 1.08it/s] |
|
|
|
|
|
78%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 854/1090 [15:09<03:39, 1.08it/s] |
|
|
|
|
|
79%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 859/1090 [15:13<03:34, 1.08it/s] |
|
|
|
|
|
|
|
79%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 865/1090 [15:19<03:28, 1.08it/s] |
|
|
|
|
|
80%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 870/1090 [15:24<03:23, 1.08it/s] |
|
|
|
|
|
80%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 874/1090 [15:27<03:20, 1.08it/s] |
|
|
|
|
|
|
|
81%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 880/1090 [15:33<03:14, 1.08it/s] |
|
|
|
|
|
81%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 885/1090 [15:37<03:09, 1.08it/s] |
|
|
|
|
|
82%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 889/1090 [15:41<03:05, 1.08it/s] |
|
|
|
|
|
|
|
82%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 895/1090 [15:47<03:16, 1.01s/it] |
|
|
|
|
|
82%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 899/1090 [15:51<03:00, 1.06it/s] |
|
83%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 900/1090 [15:52<02:58, 1.06it/s][INFO|trainer.py:2889] 2024-02-01 17:54:27,664 >> Saving model checkpoint to ./tmp-checkpoint-900 |
|
[INFO|configuration_utils.py:483] 2024-02-01 17:54:27,667 >> Configuration saved in ./tmp-checkpoint-900/config.json |
|
[INFO|configuration_utils.py:594] 2024-02-01 17:54:27,670 >> Configuration saved in ./tmp-checkpoint-900/generation_config.json |
|
[2024-02-01 17:54:30,775] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step900 is about to be saved! |
|
[2024-02-01 17:54:30,780] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: ./tmp-checkpoint-900/global_step900/zero_pp_rank_0_mp_rank_00_model_states.pt |
|
[2024-02-01 17:54:30,780] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ./tmp-checkpoint-900/global_step900/zero_pp_rank_0_mp_rank_00_model_states.pt... |
|
[2024-02-01 17:54:30,784] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ./tmp-checkpoint-900/global_step900/zero_pp_rank_0_mp_rank_00_model_states.pt. |
|
[2024-02-01 17:54:30,790] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ./tmp-checkpoint-900/global_step900/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... |
|
[INFO|modeling_utils.py:2382] 2024-02-01 17:54:30,743 >> Model weights saved in ./tmp-checkpoint-900/pytorch_model.bin |
|
[INFO|tokenization_utils_base.py:2432] 2024-02-01 17:54:30,746 >> tokenizer config file saved in ./tmp-checkpoint-900/tokenizer_config.json |
|
[INFO|tokenization_utils_base.py:2441] 2024-02-01 17:54:30,749 >> Special tokens file saved in ./tmp-checkpoint-900/special_tokens_map.json |
|
/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. |
|
warnings.warn( |
|
[2024-02-01 17:54:34,521] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ./tmp-checkpoint-900/global_step900/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. |
|
[2024-02-01 17:54:34,525] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved ./tmp-checkpoint-900/global_step900/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt |
|
[2024-02-01 17:54:34,855] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step900 is ready now! |
|
[INFO|tokenization_utils_base.py:2432] 2024-02-01 17:54:37,837 >> tokenizer config file saved in ./tokenizer_config.json |
|
[INFO|tokenization_utils_base.py:2441] 2024-02-01 17:54:37,839 >> Special tokens file saved in ./special_tokens_map.json |
|
[INFO|trainer.py:2979] 2024-02-01 17:54:37,868 >> Deleting older checkpoint [checkpoint-800] due to args.save_total_limit |
|
|
|
|
|
83%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 905/1090 [16:11<06:09, 2.00s/it] |
|
|
|
|
|
83%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 909/1090 [16:15<03:33, 1.18s/it] |
|
|
|
|
|
84%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 914/1090 [16:19<02:50, 1.03it/s] |
|
|
|
|
|
|
|
84%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 920/1090 [16:25<02:37, 1.08it/s] |
|
|
|
|
|
85%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 925/1090 [16:30<02:33, 1.08it/s] |
|
|
|
|
|
85%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 929/1090 [16:33<02:28, 1.08it/s] |
|
|
|
|
|
|
|
86%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 935/1090 [16:39<02:23, 1.08it/s] |
|
|
|
|
|
86%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 940/1090 [16:43<02:18, 1.08it/s] |
|
|
|
|
|
87%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 944/1090 [16:47<02:15, 1.08it/s] |
|
|
|
|
|
87%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 948/1090 [16:51<02:12, 1.08it/s] |
|
|
|
|
|
|
|
88%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 955/1090 [16:57<02:04, 1.08it/s] |
|
|
|
|
|
88%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 959/1090 [17:01<02:01, 1.08it/s] |
|
|
|
|
|
88%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 963/1090 [17:05<01:57, 1.08it/s] |
|
|
|
|
|
|
|
89%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 970/1090 [17:11<01:51, 1.08it/s] |
|
|
|
|
|
89%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 974/1090 [17:15<01:47, 1.08it/s] |
|
|
|
|
|
90%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 979/1090 [17:20<01:42, 1.08it/s] |
|
|
|
|
|
|
|
90%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 985/1090 [17:25<01:37, 1.08it/s] |
|
|
|
|
|
91%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 989/1090 [17:29<01:33, 1.08it/s] |
|
|
|
|
|
91%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 994/1090 [17:33<01:28, 1.08it/s] |
|
|
|
|
|
|
|
92%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 1000/1090 [17:39<01:23, 1.08it/s] |
|
92%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 1000/1090 [17:39<01:23, 1.08it/s][INFO|trainer.py:2889] 2024-02-01 17:56:15,101 >> Saving model checkpoint to ./tmp-checkpoint-1000 |
|
[INFO|configuration_utils.py:483] 2024-02-01 17:56:15,105 >> Configuration saved in ./tmp-checkpoint-1000/config.json |
|
[INFO|configuration_utils.py:594] 2024-02-01 17:56:15,107 >> Configuration saved in ./tmp-checkpoint-1000/generation_config.json |
|
[INFO|modeling_utils.py:2382] 2024-02-01 17:56:18,392 >> Model weights saved in ./tmp-checkpoint-1000/pytorch_model.bin |
|
[INFO|tokenization_utils_base.py:2432] 2024-02-01 17:56:18,395 >> tokenizer config file saved in ./tmp-checkpoint-1000/tokenizer_config.json |
|
[INFO|tokenization_utils_base.py:2441] 2024-02-01 17:56:18,397 >> Special tokens file saved in ./tmp-checkpoint-1000/special_tokens_map.json |
|
/fsx/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details. |
|
warnings.warn( |
|
[2024-02-01 17:56:18,423] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step1000 is about to be saved! |
|
[2024-02-01 17:56:18,427] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: ./tmp-checkpoint-1000/global_step1000/zero_pp_rank_0_mp_rank_00_model_states.pt |
|
[2024-02-01 17:56:18,427] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ./tmp-checkpoint-1000/global_step1000/zero_pp_rank_0_mp_rank_00_model_states.pt... |
|
[2024-02-01 17:56:18,432] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ./tmp-checkpoint-1000/global_step1000/zero_pp_rank_0_mp_rank_00_model_states.pt. |
|
[2024-02-01 17:56:18,439] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving ./tmp-checkpoint-1000/global_step1000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... |
|
[2024-02-01 17:56:22,348] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved ./tmp-checkpoint-1000/global_step1000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. |
|
[2024-02-01 17:56:22,353] [INFO] [engine.py:3393:_save_zero_checkpoint] zero checkpoint saved ./tmp-checkpoint-1000/global_step1000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt |
|
[2024-02-01 17:56:22,553] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step1000 is ready now! |
|
[INFO|tokenization_utils_base.py:2432] 2024-02-01 17:56:25,394 >> tokenizer config file saved in ./tokenizer_config.json |
|
[INFO|tokenization_utils_base.py:2441] 2024-02-01 17:56:25,396 >> Special tokens file saved in ./special_tokens_map.json |
|
[INFO|trainer.py:2979] 2024-02-01 17:56:25,422 >> Deleting older checkpoint [checkpoint-900] due to args.save_total_limit |
|
|
|
92%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 1003/1090 [17:57<04:31, 3.12s/it] |
|
|
|
|
|
|
|
93%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 1010/1090 [18:03<01:28, 1.10s/it] |
|
|
|
|
|
93%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 1014/1090 [18:07<01:13, 1.03it/s] |
|
|
|
|
|
93%|ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 1019/1090 [18:12<01:06, 1.07it/s] |
|
|
|
|
|
|
|
94%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 1025/1090 [18:17<01:00, 1.08it/s] |
|
|
|
|
|
94%|βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | 1029/1090 [18:21<00:56, 1.08it/s] |
|
|
|
|