|
slurm submission log: 2024-05-24 11:42:13.175719 |
|
created following sbatch script: |
|
|
|
############################### |
|
|
|
#!/bin/bash |
|
|
|
#SBATCH --account=nlp |
|
#SBATCH --cpus-per-task=16 |
|
#SBATCH --dependency=afterok:7648457 |
|
#SBATCH --gres=gpu:2 |
|
#SBATCH --job-name=tthrush-job-3227437 |
|
#SBATCH --mem=100G |
|
#SBATCH --nodelist=sphinx2 |
|
#SBATCH --open-mode=append |
|
#SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_initial_init_min_threshold/llms/pythia-70m_arc_easy_1/train_job_output.txt |
|
#SBATCH --partition=sphinx |
|
#SBATCH --time=14-0 |
|
|
|
# activate your desired anaconda environment |
|
. /nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection |
|
|
|
# cd to working directory |
|
cd . |
|
|
|
# launch commands |
|
srun --unbuffered run_as_child_processes 'torchrun --master_port 29527 --nproc_per_node=2 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_initial_init_min_threshold/data/arc_easy --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_initial_init_min_threshold/llms/pythia-70m_arc_easy_1 --output_hub_id pythia-70m_arc_easy --model_id EleutherAI/pythia-70m --learning_rate 1e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 2 --per_device_train_batch_size 256 --seed 1 --num_train_epochs 14' |
|
|
|
############################### |
|
|
|
submission to slurm complete! |
|
|
|
|
|
############################### |
|
slurm submission output |
|
|
|
Submitted batch job 7648458 |
|
|
|
|
|
|
|
############################### |
|
|
|
/var/lib/slurm/slurmd/job7648458/slurm_script: line 16: /nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/etc/profile.d/conda.sh: No such file or directory |
|
|
|
CommandNotFoundError: Your shell has not been properly configured to use 'conda activate'. |
|
To initialize your shell, run |
|
|
|
$ conda init <SHELL_NAME> |
|
|
|
Currently supported shells are: |
|
- bash |
|
- fish |
|
- tcsh |
|
- xonsh |
|
- zsh |
|
- powershell |
|
|
|
See 'conda init --help' for more information and options. |
|
|
|
IMPORTANT: You may need to close and restart your shell after running 'conda init'. |
|
|
|
|
|
############################### |
|
start time: 2024-05-24 11:43:09.843114 |
|
machine: sphinx2 |
|
conda env: pretraining-coreset-selection |
|
############################### |
|
running following processes |
|
|
|
torchrun --master_port 29527 --nproc_per_node=2 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_initial_init_min_threshold/data/arc_easy --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_initial_init_min_threshold/llms/pythia-70m_arc_easy_1 --output_hub_id pythia-70m_arc_easy --model_id EleutherAI/pythia-70m --learning_rate 1e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 2 --per_device_train_batch_size 256 --seed 1 --num_train_epochs 14 |
|
|
|
|
|
############################### |
|
command outputs: |
|
|
|
|
|
[2024-05-24 11:43:11,843] torch.distributed.run: [WARNING] |
|
[2024-05-24 11:43:11,843] torch.distributed.run: [WARNING] ***************************************** |
|
[2024-05-24 11:43:11,843] torch.distributed.run: [WARNING] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. |
|
[2024-05-24 11:43:11,843] torch.distributed.run: [WARNING] ***************************************** |
|
05/24/2024 11:43:15 - INFO - __main__ - Script parameters ScriptArguments(seed=1, dataset_id='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_initial_init_min_threshold/data/arc_easy', output_dir='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_initial_init_min_threshold/llms/pythia-70m_arc_easy_1', output_hub_id='pythia-70m_arc_easy', hf_hub_token=True, model_id='EleutherAI/pythia-70m', per_device_train_batch_size=256, num_train_epochs=14.0, learning_rate=0.001, gradient_accumulation_steps=2, from_scratch=True, warmup_ratio=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, weight_decay=0.01, lr_scheduler_type='cosine', local_rank=0, resume_from_checkpoint=False, deepspeed=None, peft=False) |
|
05/24/2024 11:43:16 - INFO - __main__ - Script parameters ScriptArguments(seed=1, dataset_id='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_initial_init_min_threshold/data/arc_easy', output_dir='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_initial_init_min_threshold/llms/pythia-70m_arc_easy_1', output_hub_id='pythia-70m_arc_easy', hf_hub_token=True, model_id='EleutherAI/pythia-70m', per_device_train_batch_size=256, num_train_epochs=14.0, learning_rate=0.001, gradient_accumulation_steps=2, from_scratch=True, warmup_ratio=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, weight_decay=0.01, lr_scheduler_type='cosine', local_rank=0, resume_from_checkpoint=False, deepspeed=None, peft=False) |
|
Traceback (most recent call last): |
|
File "/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/train_llm.py", line 204, in <module> |
|
train_model() |
|
File "/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/train_llm.py", line 164, in train_model |
|
train_dataset = load_from_disk(script_args.dataset_id) |
|
File "/nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/lib/python3.10/site-packages/datasets/load.py", line 2638, in load_from_disk |
|
raise FileNotFoundError( |
|
FileNotFoundError: Directory /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_initial_init_min_threshold/data/arc_easy is neither a `Dataset` directory nor a `DatasetDict` directory. |
|
Traceback (most recent call last): |
|
File "/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/train_llm.py", line 204, in <module> |
|
train_model() |
|
File "/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/train_llm.py", line 164, in train_model |
|
train_dataset = load_from_disk(script_args.dataset_id) |
|
File "/nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/lib/python3.10/site-packages/datasets/load.py", line 2638, in load_from_disk |
|
raise FileNotFoundError( |
|
FileNotFoundError: Directory /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_initial_init_min_threshold/data/arc_easy is neither a `Dataset` directory nor a `DatasetDict` directory. |
|
[2024-05-24 11:43:21,865] torch.distributed.elastic.multiprocessing.api: [ERROR] failed (exitcode: 1) local_rank: 0 (pid: 2635429) of binary: /nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/bin/python |
|
Traceback (most recent call last): |
|
File "/nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/bin/torchrun", line 8, in <module> |
|
sys.exit(main()) |
|
File "/nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper |
|
return f(*args, **kwargs) |
|
File "/nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/lib/python3.10/site-packages/torch/distributed/run.py", line 812, in main |
|
run(args) |
|
File "/nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/lib/python3.10/site-packages/torch/distributed/run.py", line 803, in run |
|
elastic_launch( |
|
File "/nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 135, in __call__ |
|
return launch_agent(self._config, self._entrypoint, list(args)) |
|
File "/nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 268, in launch_agent |
|
raise ChildFailedError( |
|
torch.distributed.elastic.multiprocessing.errors.ChildFailedError: |
|
============================================================ |
|
train_llm.py FAILED |
|
------------------------------------------------------------ |
|
Failures: |
|
[1]: |
|
time : 2024-05-24_11:43:21 |
|
host : sphinx2.stanford.edu |
|
rank : 1 (local_rank: 1) |
|
exitcode : 1 (pid: 2635430) |
|
error_file: <N/A> |
|
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html |
|
------------------------------------------------------------ |
|
Root Cause (first observed failure): |
|
[0]: |
|
time : 2024-05-24_11:43:21 |
|
host : sphinx2.stanford.edu |
|
rank : 0 (local_rank: 0) |
|
exitcode : 1 (pid: 2635429) |
|
error_file: <N/A> |
|
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html |
|
============================================================ |
|
############################### |
|
end time: 2024-05-24 11:43:29.871511 |
|
elapsed time: 0:00:20.028397 |
|
slurm submission log: 2024-05-24 11:46:18.773501 |
|
created following sbatch script: |
|
|
|
############################### |
|
|
|
#!/bin/bash |
|
|
|
#SBATCH --account=nlp |
|
#SBATCH --cpus-per-task=16 |
|
#SBATCH --dependency=afterok:7648489 |
|
#SBATCH --gres=gpu:2 |
|
#SBATCH --job-name=tthrush-job-1194216 |
|
#SBATCH --mem=100G |
|
#SBATCH --nodelist=sphinx2 |
|
#SBATCH --open-mode=append |
|
#SBATCH --output=/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_initial_init_min_threshold/llms/pythia-70m_arc_easy_1/train_job_output.txt |
|
#SBATCH --partition=sphinx |
|
#SBATCH --time=14-0 |
|
|
|
# activate your desired anaconda environment |
|
. /nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/etc/profile.d/conda.sh ; conda activate pretraining-coreset-selection |
|
|
|
# cd to working directory |
|
cd . |
|
|
|
# launch commands |
|
srun --unbuffered run_as_child_processes 'torchrun --master_port 29527 --nproc_per_node=2 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_initial_init_min_threshold/data/arc_easy --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_initial_init_min_threshold/llms/pythia-70m_arc_easy_1 --output_hub_id pythia-70m_arc_easy --model_id EleutherAI/pythia-70m --learning_rate 1e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 2 --per_device_train_batch_size 256 --seed 1 --num_train_epochs 14' |
|
|
|
############################### |
|
|
|
submission to slurm complete! |
|
|
|
|
|
############################### |
|
slurm submission output |
|
|
|
Submitted batch job 7648490 |
|
|
|
|
|
|
|
############################### |
|
|
|
/var/lib/slurm/slurmd/job7648490/slurm_script: line 16: /nlp/scr/tthrush/miniconda3/envs/pretraining-coreset-selection/etc/profile.d/conda.sh: No such file or directory |
|
|
|
CommandNotFoundError: Your shell has not been properly configured to use 'conda activate'. |
|
To initialize your shell, run |
|
|
|
$ conda init <SHELL_NAME> |
|
|
|
Currently supported shells are: |
|
- bash |
|
- fish |
|
- tcsh |
|
- xonsh |
|
- zsh |
|
- powershell |
|
|
|
See 'conda init --help' for more information and options. |
|
|
|
IMPORTANT: You may need to close and restart your shell after running 'conda init'. |
|
|
|
|
|
############################### |
|
start time: 2024-05-24 13:44:10.592875 |
|
machine: sphinx2 |
|
conda env: pretraining-coreset-selection |
|
############################### |
|
running following processes |
|
|
|
torchrun --master_port 29527 --nproc_per_node=2 train_llm.py --dataset_id /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_initial_init_min_threshold/data/arc_easy --output_dir /juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_initial_init_min_threshold/llms/pythia-70m_arc_easy_1 --output_hub_id pythia-70m_arc_easy --model_id EleutherAI/pythia-70m --learning_rate 1e-3 --warmup_ratio=0.1 --gradient_accumulation_steps 2 --per_device_train_batch_size 256 --seed 1 --num_train_epochs 14 |
|
|
|
|
|
############################### |
|
command outputs: |
|
|
|
|
|
[2024-05-24 13:44:11,920] torch.distributed.run: [WARNING] |
|
[2024-05-24 13:44:11,920] torch.distributed.run: [WARNING] ***************************************** |
|
[2024-05-24 13:44:11,920] torch.distributed.run: [WARNING] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. |
|
[2024-05-24 13:44:11,920] torch.distributed.run: [WARNING] ***************************************** |
|
05/24/2024 13:44:15 - INFO - __main__ - Script parameters ScriptArguments(seed=1, dataset_id='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_initial_init_min_threshold/data/arc_easy', output_dir='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_initial_init_min_threshold/llms/pythia-70m_arc_easy_1', output_hub_id='pythia-70m_arc_easy', hf_hub_token=True, model_id='EleutherAI/pythia-70m', per_device_train_batch_size=256, num_train_epochs=14.0, learning_rate=0.001, gradient_accumulation_steps=2, from_scratch=True, warmup_ratio=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, weight_decay=0.01, lr_scheduler_type='cosine', local_rank=0, resume_from_checkpoint=False, deepspeed=None, peft=False) |
|
05/24/2024 13:44:15 - INFO - __main__ - Script parameters ScriptArguments(seed=1, dataset_id='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_initial_init_min_threshold/data/arc_easy', output_dir='/juice5/scr5/tthrush/pretraining-coreset-selection/llm_pretraining/test_ordinal_constrained_initial_init_min_threshold/llms/pythia-70m_arc_easy_1', output_hub_id='pythia-70m_arc_easy', hf_hub_token=True, model_id='EleutherAI/pythia-70m', per_device_train_batch_size=256, num_train_epochs=14.0, learning_rate=0.001, gradient_accumulation_steps=2, from_scratch=True, warmup_ratio=0.1, adam_beta1=0.9, adam_beta2=0.95, adam_epsilon=1e-08, weight_decay=0.01, lr_scheduler_type='cosine', local_rank=0, resume_from_checkpoint=False, deepspeed=None, peft=False) |
|
0%| | 0/11088 [00:00<?, ?it/s][rank0]:[W reducer.cpp:1360] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) |
|
[rank1]:[W reducer.cpp:1360] Warning: find_unused_parameters=True was specified in DDP constructor, but did not find any unused parameters in the forward pass. This flag results in an extra traversal of the autograd graph every iteration, which can adversely affect performance. If your model indeed never has any unused parameters in the forward pass, consider turning this flag off. Note that this warning may be a false positive if your model has flow control causing later iterations to have unused parameters. (function operator()) |
|
0%| | 1/11088 [00:05<18:11:48, 5.91s/it]
0%| | 2/11088 [00:07<11:10:40, 3.63s/it]
0%| | 3/11088 [00:09<7:59:56, 2.60s/it]
0%| | 4/11088 [00:10<6:17:01, 2.04s/it]
0%| | 5/11088 [00:11<5:08:34, 1.67s/it]
0%| | 6/11088 [00:12<4:21:17, 1.41s/it]
0%| | 7/11088 [00:13<3:48:51, 1.24s/it]
0%| | 8/11088 [00:14<3:21:15, 1.09s/it]
0%| | 9/11088 [00:14<3:01:54, 1.02it/s]
0%| | 10/11088 [00:15<2:47:43, 1.10it/s]
0%| | 11/11088 [00:16<2:37:04, 1.18it/s]
0%| | 12/11088 [00:16<2:27:00, 1.26it/s]
0%| | 13/11088 [00:17<2:18:02, 1.34it/s]
0%| | 14/11088 [00:18<2:11:40, 1.40it/s]
0%| | 15/11088 [00:18<2:04:46, 1.48it/s]
0%| | 16/11088 [00:19<1:59:30, 1.54it/s]
0%| | 17/11088 [00:19<1:55:27, 1.60it/s]
0%| | 18/11088 [00:20<1:52:06, 1.65it/s]
0%| | 19/11088 [00:21<1:49:51, 1.68it/s]
0%| | 20/11088 [00:21<1:47:13, 1.72it/s]
0%| | 21/11088 [00:22<1:44:37, 1.76it/s]
0%| | 22/11088 [00:22<1:43:03, 1.79it/s]
0%| | 23/11088 [00:23<1:41:58, 1.81it/s]
0%| | 24/11088 [00:23<1:40:25, 1.84it/s]
0%| | 25/11088 [00:24<1:39:43, 1.85it/s]
{'loss': 10.6741, 'grad_norm': 1.357293725013733, 'learning_rate': 2.2542831379621283e-05, 'epoch': 0.03} |
|
0%| | 25/11088 [00:24<1:39:43, 1.85it/s]
0%| | 26/11088 [00:24<1:39:06, 1.86it/s]
0%| | 27/11088 [00:25<1:37:32, 1.89it/s]
0%| | 28/11088 [00:25<1:36:50, 1.90it/s]
0%| | 29/11088 [00:26<1:36:29, 1.91it/s]
0%| | 30/11088 [00:26<1:36:00, 1.92it/s]
0%| | 31/11088 [00:27<1:35:50, 1.92it/s]
0%| | 32/11088 [00:27<1:35:18, 1.93it/s]
0%| | 33/11088 [00:28<1:34:52, 1.94it/s]
0%| | 34/11088 [00:28<1:34:35, 1.95it/s]
0%| | 35/11088 [00:29<1:34:04, 1.96it/s]
0%| | 36/11088 [00:29<1:33:43, 1.97it/s]
0%| | 37/11088 [00:30<1:33:44, 1.96it/s]
0%| | 38/11088 [00:30<1:33:23, 1.97it/s]
0%| | 39/11088 [00:31<1:33:06, 1.98it/s]
0%| | 40/11088 [00:31<1:32:50, 1.98it/s]
0%| | 41/11088 [00:32<1:32:43, 1.99it/s]
0%| | 42/11088 [00:32<1:32:28, 1.99it/s]
0%| | 43/11088 [00:33<1:32:19, 1.99it/s]
0%| | 44/11088 [00:33<1:32:16, 1.99it/s]
0%| | 45/11088 [00:34<1:32:17, 1.99it/s]
0%| | 46/11088 [00:34<1:32:11, 2.00it/s]
0%| | 47/11088 [00:35<1:32:07, 2.00it/s]
0%| | 48/11088 [00:35<1:32:06, 2.00it/s]
0%| | 49/11088 [00:36<1:32:09, 2.00it/s]
0%| | 50/11088 [00:36<1:32:04, 2.00it/s]
{'loss': 9.9577, 'grad_norm': 1.2331771850585938, 'learning_rate': 4.508566275924257e-05, 'epoch': 0.06} |
|
0%| | 50/11088 [00:37<1:32:04, 2.00it/s]
0%| | 51/11088 [00:37<1:31:59, 2.00it/s]
0%| | 52/11088 [00:37<1:31:51, 2.00it/s]
0%| | 53/11088 [00:38<1:31:46, 2.00it/s]
0%| | 54/11088 [00:38<1:31:42, 2.01it/s]
0%| | 55/11088 [00:39<1:31:38, 2.01it/s]
1%| | 56/11088 [00:39<1:31:27, 2.01it/s]
1%| | 57/11088 [00:40<1:31:25, 2.01it/s]
1%| | 58/11088 [00:40<1:31:28, 2.01it/s]
1%| | 59/11088 [00:41<1:31:30, 2.01it/s]
1%| | 60/11088 [00:41<1:31:32, 2.01it/s]
1%| | 61/11088 [00:42<1:31:24, 2.01it/s]
1%| | 62/11088 [00:42<1:31:25, 2.01it/s]
1%| | 63/11088 [00:43<1:31:25, 2.01it/s]
1%| | 64/11088 [00:43<1:31:21, 2.01it/s]
1%| | 65/11088 [00:44<1:31:23, 2.01it/s]
1%| | 66/11088 [00:44<1:31:19, 2.01it/s]
1%| | 67/11088 [00:45<1:31:20, 2.01it/s]
1%| | 68/11088 [00:45<1:31:21, 2.01it/s]
1%| | 69/11088 [00:46<1:31:17, 2.01it/s]
1%| | 70/11088 [00:46<1:31:06, 2.02it/s]
1%| | 71/11088 [00:47<1:31:11, 2.01it/s]
1%| | 72/11088 [00:47<1:31:02, 2.02it/s]
1%| | 73/11088 [00:48<1:31:05, 2.02it/s]
1%| | 74/11088 [00:48<1:31:06, 2.01it/s]
1%| | 75/11088 [00:49<1:31:02, 2.02it/s]
{'loss': 9.2363, 'grad_norm': 1.0550384521484375, 'learning_rate': 6.762849413886383e-05, 'epoch': 0.09} |
|
1%| | 75/11088 [00:49<1:31:02, 2.02it/s]
1%| | 76/11088 [00:49<1:31:06, 2.01it/s]
1%| | 77/11088 [00:50<1:31:08, 2.01it/s]
1%| | 78/11088 [00:50<1:31:05, 2.01it/s]
1%| | 79/11088 [00:51<1:30:55, 2.02it/s]
1%| | 80/11088 [00:51<1:30:56, 2.02it/s]
1%| | 81/11088 [00:52<1:30:54, 2.02it/s]
1%| | 82/11088 [00:52<1:30:55, 2.02it/s]
1%| | 83/11088 [00:53<1:30:58, 2.02it/s]
1%| | 84/11088 [00:53<1:30:54, 2.02it/s]
1%| | 85/11088 [00:54<1:30:57, 2.02it/s]
1%| | 86/11088 [00:54<1:30:52, 2.02it/s]
1%| | 87/11088 [00:55<1:30:51, 2.02it/s]
1%| | 88/11088 [00:55<1:30:47, 2.02it/s]
1%| | 89/11088 [00:56<1:30:49, 2.02it/s]
1%| | 90/11088 [00:56<1:30:45, 2.02it/s]
1%| | 91/11088 [00:57<1:30:43, 2.02it/s]
1%| | 92/11088 [00:57<1:30:44, 2.02it/s]
1%| | 93/11088 [00:58<1:30:42, 2.02it/s]
1%| | 94/11088 [00:58<1:30:40, 2.02it/s]
1%| | 95/11088 [00:59<1:30:36, 2.02it/s]
1%| | 96/11088 [00:59<1:30:42, 2.02it/s]
1%| | 97/11088 [01:00<1:30:37, 2.02it/s]
1%| | 98/11088 [01:00<1:30:39, 2.02it/s]
1%| | 99/11088 [01:01<1:30:41, 2.02it/s]
1%| | 100/11088 [01:01<1:30:42, 2.02it/s]
{'loss': 8.4477, 'grad_norm': 0.7622587084770203, 'learning_rate': 9.017132551848513e-05, 'epoch': 0.13} |
|
1%| | 100/11088 [01:01<1:30:42, 2.02it/s]
1%| | 101/11088 [01:02<1:30:42, 2.02it/s]
1%| | 102/11088 [01:02<1:30:41, 2.02it/s]
1%| | 103/11088 [01:03<1:30:38, 2.02it/s]
1%| | 104/11088 [01:03<1:30:41, 2.02it/s]
1%| | 105/11088 [01:04<1:30:47, 2.02it/s]
1%| | 106/11088 [01:04<1:30:40, 2.02it/s]
1%| | 107/11088 [01:05<1:30:41, 2.02it/s]
1%| | 108/11088 [01:05<1:30:34, 2.02it/s]
1%| | 109/11088 [01:06<1:30:37, 2.02it/s]
1%| | 110/11088 [01:06<1:30:32, 2.02it/s]
1%| | 111/11088 [01:07<1:30:37, 2.02it/s]
1%| | 112/11088 [01:07<1:30:37, 2.02it/s]
1%| | 113/11088 [01:08<1:30:38, 2.02it/s]
1%| | 114/11088 [01:08<1:30:34, 2.02it/s]
1%| | 115/11088 [01:09<1:30:32, 2.02it/s]
1%| | 116/11088 [01:09<1:30:29, 2.02it/s]
1%| | 117/11088 [01:10<1:30:26, 2.02it/s]
1%| | 118/11088 [01:10<1:30:26, 2.02it/s]
1%| | 119/11088 [01:11<1:30:23, 2.02it/s]
1%| | 120/11088 [01:11<1:30:19, 2.02it/s]
1%| | 121/11088 [01:12<1:30:26, 2.02it/s]
1%| | 122/11088 [01:12<1:30:24, 2.02it/s]
1%| | 123/11088 [01:13<1:30:22, 2.02it/s]
1%| | 124/11088 [01:13<1:30:24, 2.02it/s]
1%| | 125/11088 [01:14<1:30:24, 2.02it/s]
{'loss': 7.7892, 'grad_norm': 0.4769901633262634, 'learning_rate': 0.00011271415689810641, 'epoch': 0.16} |
|
1%| | 125/11088 [01:14<1:30:24, 2.02it/s]
1%| | 126/11088 [01:14<1:30:29, 2.02it/s]
1%| | 127/11088 [01:15<1:30:23, 2.02it/s]
1%| | 128/11088 [01:15<1:30:20, 2.02it/s]
1%| | 129/11088 [01:16<1:30:18, 2.02it/s]
1%| | 130/11088 [01:16<1:30:16, 2.02it/s]
1%| | 131/11088 [01:17<1:30:19, 2.02it/s]
1%| | 132/11088 [01:17<1:30:16, 2.02it/s]
1%| | 133/11088 [01:18<1:30:17, 2.02it/s]
1%| | 134/11088 [01:18<1:30:16, 2.02it/s]
1%| | 135/11088 [01:19<1:30:16, 2.02it/s]
1%| | 136/11088 [01:19<1:30:17, 2.02it/s]
1%| | 137/11088 [01:20<1:30:13, 2.02it/s]
1%| | 138/11088 [01:20<1:30:14, 2.02it/s]
1%|β | 139/11088 [01:21<1:30:15, 2.02it/s]
1%|β | 140/11088 [01:21<1:30:16, 2.02it/s]
1%|β | 141/11088 [01:22<1:30:16, 2.02it/s]
1%|β | 142/11088 [01:22<1:30:19, 2.02it/s]
1%|β | 143/11088 [01:23<1:30:15, 2.02it/s]
1%|β | 144/11088 [01:23<1:30:12, 2.02it/s]
1%|β | 145/11088 [01:24<1:30:09, 2.02it/s]
1%|β | 146/11088 [01:24<1:30:08, 2.02it/s]
1%|β | 147/11088 [01:25<1:30:04, 2.02it/s]
1%|β | 148/11088 [01:25<1:30:00, 2.03it/s]
1%|β | 149/11088 [01:26<1:30:04, 2.02it/s]
1%|β | 150/11088 [01:26<1:30:01, 2.02it/s]{'loss': 7.2988, 'grad_norm': 0.42800745368003845, 'learning_rate': 0.00013525698827772766, 'epoch': 0.19}
|
|
1%|β | 150/11088 [01:26<1:30:01, 2.02it/s]
1%|β | 151/11088 [01:27<1:30:07, 2.02it/s]
1%|β | 152/11088 [01:27<1:30:02, 2.02it/s]
1%|β | 153/11088 [01:28<1:30:05, 2.02it/s]
1%|β | 154/11088 [01:28<1:30:04, 2.02it/s]
1%|β | 155/11088 [01:29<1:30:07, 2.02it/s]
1%|β | 156/11088 [01:29<1:29:59, 2.02it/s]
1%|β | 157/11088 [01:30<1:30:05, 2.02it/s]
1%|β | 158/11088 [01:30<1:30:03, 2.02it/s]
1%|β | 159/11088 [01:30<1:30:06, 2.02it/s]
1%|β | 160/11088 [01:31<1:30:00, 2.02it/s]
1%|β | 161/11088 [01:31<1:30:05, 2.02it/s]
1%|β | 162/11088 [01:32<1:29:57, 2.02it/s]
1%|β | 163/11088 [01:32<1:29:58, 2.02it/s]
1%|β | 164/11088 [01:33<1:29:57, 2.02it/s]
1%|β | 165/11088 [01:33<1:29:56, 2.02it/s]
1%|β | 166/11088 [01:34<1:29:56, 2.02it/s]
2%|β | 167/11088 [01:34<1:29:57, 2.02it/s]
2%|β | 168/11088 [01:35<1:29:55, 2.02it/s]
2%|β | 169/11088 [01:35<1:29:50, 2.03it/s]
2%|β | 170/11088 [01:36<1:29:52, 2.02it/s]
2%|β | 171/11088 [01:36<1:29:51, 2.02it/s]
2%|β | 172/11088 [01:37<1:29:53, 2.02it/s]
2%|β | 173/11088 [01:37<1:29:56, 2.02it/s]
2%|β | 174/11088 [01:38<1:29:55, 2.02it/s]
2%|β | 175/11088 [01:38<1:29:55, 2.02it/s]{'loss': 6.8842, 'grad_norm': 0.3714645206928253, 'learning_rate': 0.00015779981965734896, 'epoch': 0.22}
|
|
2%|β | 175/11088 [01:38<1:29:55, 2.02it/s]
2%|β | 176/11088 [01:39<1:30:04, 2.02it/s]
2%|β | 177/11088 [01:39<1:30:01, 2.02it/s]
2%|β | 178/11088 [01:40<1:29:54, 2.02it/s]
2%|β | 179/11088 [01:40<1:29:57, 2.02it/s]
2%|β | 180/11088 [01:41<1:29:50, 2.02it/s]
2%|β | 181/11088 [01:41<1:29:51, 2.02it/s]
2%|β | 182/11088 [01:42<1:29:49, 2.02it/s]
2%|β | 183/11088 [01:42<1:29:47, 2.02it/s]
2%|β | 184/11088 [01:43<1:29:50, 2.02it/s]
2%|β | 185/11088 [01:43<1:29:47, 2.02it/s]
2%|β | 186/11088 [01:44<1:29:50, 2.02it/s]
2%|β | 187/11088 [01:44<1:29:47, 2.02it/s]
2%|β | 188/11088 [01:45<1:29:52, 2.02it/s]
2%|β | 189/11088 [01:45<1:29:53, 2.02it/s]
2%|β | 190/11088 [01:46<1:29:53, 2.02it/s]
2%|β | 191/11088 [01:46<1:29:50, 2.02it/s]
2%|β | 192/11088 [01:47<1:29:49, 2.02it/s]
2%|β | 193/11088 [01:47<1:29:48, 2.02it/s]
2%|β | 194/11088 [01:48<1:29:50, 2.02it/s]
2%|β | 195/11088 [01:48<1:29:51, 2.02it/s]
2%|β | 196/11088 [01:49<1:29:51, 2.02it/s]
2%|β | 197/11088 [01:49<1:29:54, 2.02it/s]
2%|β | 198/11088 [01:50<1:29:49, 2.02it/s]
2%|β | 199/11088 [01:50<1:29:51, 2.02it/s]
2%|β | 200/11088 [01:51<1:29:44, 2.02it/s]
{'loss': 6.5354, 'grad_norm': 0.5754061341285706, 'learning_rate': 0.00018034265103697027, 'epoch': 0.25} |
|
2%|β | 200/11088 [01:51<1:29:44, 2.02it/s]
2%|β | 201/11088 [01:51<1:29:56, 2.02it/s]
2%|β | 202/11088 [01:52<1:29:49, 2.02it/s]
2%|β | 203/11088 [01:52<1:29:53, 2.02it/s]
2%|β | 204/11088 [01:53<1:29:47, 2.02it/s]
2%|β | 205/11088 [01:53<1:29:52, 2.02it/s]
2%|β | 206/11088 [01:54<1:29:42, 2.02it/s]
2%|β | 207/11088 [01:54<1:29:46, 2.02it/s]
2%|β | 208/11088 [01:55<1:29:43, 2.02it/s]
2%|β | 209/11088 [01:55<1:29:45, 2.02it/s]
2%|β | 210/11088 [01:56<1:29:46, 2.02it/s]
2%|β | 211/11088 [01:56<1:29:39, 2.02it/s]
2%|β | 212/11088 [01:57<1:29:33, 2.02it/s]
2%|β | 213/11088 [01:57<1:29:36, 2.02it/s]
2%|β | 214/11088 [01:58<1:29:31, 2.02it/s]
2%|β | 215/11088 [01:58<1:29:33, 2.02it/s]
2%|β | 216/11088 [01:59<1:29:26, 2.03it/s]
2%|β | 217/11088 [01:59<1:29:31, 2.02it/s]
2%|β | 218/11088 [02:00<1:29:27, 2.02it/s]
2%|β | 219/11088 [02:00<1:29:36, 2.02it/s]
2%|β | 220/11088 [02:01<1:29:28, 2.02it/s]
2%|β | 221/11088 [02:01<1:29:33, 2.02it/s]
2%|β | 222/11088 [02:02<1:29:29, 2.02it/s]
2%|β | 223/11088 [02:02<1:29:32, 2.02it/s]
2%|β | 224/11088 [02:03<1:29:24, 2.03it/s]
2%|β | 225/11088 [02:03<1:29:25, 2.02it/s]
{'loss': 6.2653, 'grad_norm': 0.6229713559150696, 'learning_rate': 0.00020288548241659152, 'epoch': 0.28}
|
|
2%|β | 225/11088 [02:03<1:29:25, 2.02it/s]
2%|β | 226/11088 [02:04<1:29:31, 2.02it/s]
2%|β | 227/11088 [02:04<1:29:32, 2.02it/s]
2%|β | 228/11088 [02:05<1:29:25, 2.02it/s]
2%|β | 229/11088 [02:05<1:29:30, 2.02it/s]
2%|β | 230/11088 [02:06<1:29:28, 2.02it/s]
2%|β | 231/11088 [02:06<1:29:30, 2.02it/s]
2%|β | 232/11088 [02:07<1:29:26, 2.02it/s]
2%|β | 233/11088 [02:07<1:29:26, 2.02it/s]
2%|β | 234/11088 [02:08<1:29:21, 2.02it/s]
2%|β | 235/11088 [02:08<1:29:20, 2.02it/s]
2%|β | 236/11088 [02:09<1:29:19, 2.02it/s]
2%|β | 237/11088 [02:09<1:29:14, 2.03it/s]
2%|β | 238/11088 [02:10<1:29:18, 2.02it/s]
2%|β | 239/11088 [02:10<1:29:13, 2.03it/s]
2%|β | 240/11088 [02:11<1:29:16, 2.03it/s]
2%|β | 241/11088 [02:11<1:29:13, 2.03it/s]
2%|β | 242/11088 [02:12<1:29:19, 2.02it/s]
2%|β | 243/11088 [02:12<1:29:11, 2.03it/s]
2%|β | 244/11088 [02:13<1:29:18, 2.02it/s]
2%|β | 245/11088 [02:13<1:29:13, 2.03it/s]
2%|β | 246/11088 [02:14<1:29:19, 2.02it/s]
2%|β | 247/11088 [02:14<1:29:13, 2.02it/s]
2%|β | 248/11088 [02:14<1:29:18, 2.02it/s]
2%|β | 249/11088 [02:15<1:29:14, 2.02it/s]
2%|β | 250/11088 [02:15<1:29:14, 2.02it/s]{'loss': 6.0594, 'grad_norm': 0.6278337836265564, 'learning_rate': 0.00022542831379621282, 'epoch': 0.32}
|
|
2%|β | 250/11088 [02:15<1:29:14, 2.02it/s]
2%|β | 251/11088 [02:16<1:29:20, 2.02it/s]
2%|β | 252/11088 [02:16<1:29:18, 2.02it/s]
2%|β | 253/11088 [02:17<1:29:19, 2.02it/s]
2%|β | 254/11088 [02:17<1:29:18, 2.02it/s]
2%|β | 255/11088 [02:18<1:29:18, 2.02it/s]
2%|β | 256/11088 [02:18<1:29:19, 2.02it/s]
2%|β | 257/11088 [02:19<1:29:20, 2.02it/s]
2%|β | 258/11088 [02:19<1:29:19, 2.02it/s]
2%|β | 259/11088 [02:20<1:29:18, 2.02it/s]
2%|β | 260/11088 [02:20<1:29:15, 2.02it/s]
2%|β | 261/11088 [02:21<1:29:17, 2.02it/s]
2%|β | 262/11088 [02:21<1:29:13, 2.02it/s]
2%|β | 263/11088 [02:22<1:29:14, 2.02it/s]
2%|β | 264/11088 [02:22<1:29:15, 2.02it/s]
2%|β | 265/11088 [02:23<1:29:14, 2.02it/s]
2%|β | 266/11088 [02:23<1:29:14, 2.02it/s]
2%|β | 267/11088 [02:24<1:29:12, 2.02it/s]
2%|β | 268/11088 [02:24<1:29:10, 2.02it/s]
2%|β | 269/11088 [02:25<1:29:03, 2.02it/s]
2%|β | 270/11088 [02:25<1:29:06, 2.02it/s]
2%|β | 271/11088 [02:26<1:28:59, 2.03it/s]
2%|β | 272/11088 [02:26<1:29:05, 2.02it/s]
2%|β | 273/11088 [02:27<1:28:59, 2.03it/s]
2%|β | 274/11088 [02:27<1:29:03, 2.02it/s]
2%|β | 275/11088 [02:28<1:29:01, 2.02it/s]
{'loss': 5.8876, 'grad_norm': 0.6381323337554932, 'learning_rate': 0.00024797114517583407, 'epoch': 0.35}
|
|
2%|β | 275/11088 [02:28<1:29:01, 2.02it/s]
2%|β | 276/11088 [02:28<1:29:07, 2.02it/s]
2%|β | 277/11088 [02:29<1:29:06, 2.02it/s]
3%|β | 278/11088 [02:29<1:29:04, 2.02it/s]
3%|β | 279/11088 [02:30<1:29:05, 2.02it/s]
3%|β | 280/11088 [02:30<1:29:07, 2.02it/s]
3%|β | 281/11088 [02:31<1:29:02, 2.02it/s]
3%|β | 282/11088 [02:31<1:29:06, 2.02it/s]
3%|β | 283/11088 [02:32<1:29:02, 2.02it/s]
3%|β | 284/11088 [02:32<1:29:03, 2.02it/s]
3%|β | 285/11088 [02:33<1:29:00, 2.02it/s]
3%|β | 286/11088 [02:33<1:29:01, 2.02it/s]
3%|β | 287/11088 [02:34<1:29:00, 2.02it/s]
3%|β | 288/11088 [02:34<1:28:59, 2.02it/s]
3%|β | 289/11088 [02:35<1:28:59, 2.02it/s]
3%|β | 290/11088 [02:35<1:28:59, 2.02it/s]
3%|β | 291/11088 [02:36<1:29:01, 2.02it/s]
3%|β | 292/11088 [02:36<1:29:03, 2.02it/s]
3%|β | 293/11088 [02:37<1:29:01, 2.02it/s]
3%|β | 294/11088 [02:37<1:29:00, 2.02it/s]
3%|β | 295/11088 [02:38<1:28:57, 2.02it/s]
3%|β | 296/11088 [02:38<1:28:58, 2.02it/s]
3%|β | 297/11088 [02:39<1:29:00, 2.02it/s]
3%|β | 298/11088 [02:39<1:28:59, 2.02it/s]
3%|β | 299/11088 [02:40<1:28:59, 2.02it/s]
3%|β | 300/11088 [02:40<1:29:01, 2.02it/s]
{'loss': 5.7444, 'grad_norm': 1.03485906124115, 'learning_rate': 0.0002705139765554553, 'epoch': 0.38} |
|
3%|β | 300/11088 [02:40<1:29:01, 2.02it/s]
3%|β | 301/11088 [02:41<1:29:03, 2.02it/s]
3%|β | 302/11088 [02:41<1:28:59, 2.02it/s]
3%|β | 303/11088 [02:42<1:28:59, 2.02it/s]
3%|β | 304/11088 [02:42<1:28:50, 2.02it/s]
3%|β | 305/11088 [02:43<1:28:52, 2.02it/s]
3%|β | 306/11088 [02:43<1:28:46, 2.02it/s]
3%|β | 307/11088 [02:44<1:28:50, 2.02it/s]
3%|β | 308/11088 [02:44<1:28:46, 2.02it/s]
3%|β | 309/11088 [02:45<1:28:46, 2.02it/s]
3%|β | 310/11088 [02:45<1:28:45, 2.02it/s]
3%|β | 311/11088 [02:46<1:28:46, 2.02it/s]
3%|β | 312/11088 [02:46<1:28:50, 2.02it/s]
3%|β | 313/11088 [02:47<1:28:46, 2.02it/s]
3%|β | 314/11088 [02:47<1:28:41, 2.02it/s]
3%|β | 315/11088 [02:48<1:28:44, 2.02it/s]
3%|β | 316/11088 [02:48<1:28:42, 2.02it/s]
3%|β | 317/11088 [02:49<1:28:42, 2.02it/s]
3%|β | 318/11088 [02:49<1:28:41, 2.02it/s]
3%|β | 319/11088 [02:50<1:28:39, 2.02it/s]
3%|β | 320/11088 [02:50<1:28:39, 2.02it/s]
3%|β | 321/11088 [02:51<1:28:39, 2.02it/s]
3%|β | 322/11088 [02:51<1:28:35, 2.03it/s]
3%|β | 323/11088 [02:52<1:28:38, 2.02it/s]
3%|β | 324/11088 [02:52<1:28:32, 2.03it/s]
3%|β | 325/11088 [02:53<1:28:37, 2.02it/s]
{'loss': 5.6197, 'grad_norm': 0.7586981058120728, 'learning_rate': 0.0002930568079350767, 'epoch': 0.41} |
|
3%|β | 325/11088 [02:53<1:28:37, 2.02it/s]
3%|β | 326/11088 [02:53<1:28:40, 2.02it/s]
3%|β | 327/11088 [02:54<1:28:41, 2.02it/s]
3%|β | 328/11088 [02:54<1:28:38, 2.02it/s]
3%|β | 329/11088 [02:55<1:28:38, 2.02it/s]
3%|β | 330/11088 [02:55<1:28:40, 2.02it/s]
3%|β | 331/11088 [02:56<1:28:38, 2.02it/s]
3%|β | 332/11088 [02:56<1:28:36, 2.02it/s]
3%|β | 333/11088 [02:57<1:28:35, 2.02it/s]
3%|β | 334/11088 [02:57<1:28:36, 2.02it/s]
3%|β | 335/11088 [02:58<1:28:38, 2.02it/s]
3%|β | 336/11088 [02:58<1:28:28, 2.03it/s]
3%|β | 337/11088 [02:58<1:28:34, 2.02it/s]
3%|β | 338/11088 [02:59<1:28:34, 2.02it/s]
3%|β | 339/11088 [02:59<1:28:36, 2.02it/s]
3%|β | 340/11088 [03:00<1:28:33, 2.02it/s]
3%|β | 341/11088 [03:00<1:28:37, 2.02it/s]
3%|β | 342/11088 [03:01<1:28:33, 2.02it/s]
3%|β | 343/11088 [03:01<1:28:35, 2.02it/s]
3%|β | 344/11088 [03:02<1:28:29, 2.02it/s]
3%|β | 345/11088 [03:02<1:28:35, 2.02it/s]
3%|β | 346/11088 [03:03<1:28:29, 2.02it/s]
3%|β | 347/11088 [03:03<1:28:34, 2.02it/s]
3%|β | 348/11088 [03:04<1:28:27, 2.02it/s]
3%|β | 349/11088 [03:04<1:28:31, 2.02it/s]
3%|β | 350/11088 [03:05<1:28:25, 2.02it/s]
{'loss': 5.5272, 'grad_norm': 0.6340382099151611, 'learning_rate': 0.0003155996393146979, 'epoch': 0.44} |
|
3%|β | 350/11088 [03:05<1:28:25, 2.02it/s]
3%|β | 351/11088 [03:05<1:28:32, 2.02it/s]
3%|β | 352/11088 [03:06<1:28:25, 2.02it/s]
3%|β | 353/11088 [03:06<1:28:28, 2.02it/s]
3%|β | 354/11088 [03:07<1:28:25, 2.02it/s]
3%|β | 355/11088 [03:07<1:28:26, 2.02it/s]
3%|β | 356/11088 [03:08<1:28:20, 2.02it/s]
3%|β | 357/11088 [03:08<1:28:23, 2.02it/s]
3%|β | 358/11088 [03:09<1:28:20, 2.02it/s]
3%|β | 359/11088 [03:09<1:28:23, 2.02it/s]
3%|β | 360/11088 [03:10<1:28:16, 2.03it/s]
3%|β | 361/11088 [03:10<1:28:23, 2.02it/s]
3%|β | 362/11088 [03:11<1:28:17, 2.02it/s]
3%|β | 363/11088 [03:11<1:28:18, 2.02it/s]
3%|β | 364/11088 [03:12<1:28:20, 2.02it/s]
3%|β | 365/11088 [03:12<1:28:22, 2.02it/s]
3%|β | 366/11088 [03:13<1:28:19, 2.02it/s]
3%|β | 367/11088 [03:13<1:28:21, 2.02it/s]
3%|β | 368/11088 [03:14<1:28:19, 2.02it/s]
3%|β | 369/11088 [03:14<1:28:17, 2.02it/s]
3%|β | 370/11088 [03:15<1:28:23, 2.02it/s]
3%|β | 371/11088 [03:15<1:28:21, 2.02it/s]
3%|β | 372/11088 [03:16<1:28:24, 2.02it/s]
3%|β | 373/11088 [03:16<1:28:29, 2.02it/s]
3%|β | 374/11088 [03:17<1:28:23, 2.02it/s]
3%|β | 375/11088 [03:17<1:28:21, 2.02it/s]
{'loss': 5.4259, 'grad_norm': 0.9262540936470032, 'learning_rate': 0.0003381424706943192, 'epoch': 0.47}
|
|
3%|β | 375/11088 [03:17<1:28:21, 2.02it/s]
3%|β | 376/11088 [03:18<1:28:28, 2.02it/s]
3%|β | 377/11088 [03:18<1:28:25, 2.02it/s]
3%|β | 378/11088 [03:19<1:28:18, 2.02it/s]
3%|β | 379/11088 [03:19<1:28:20, 2.02it/s]
3%|β | 380/11088 [03:20<1:28:13, 2.02it/s]
3%|β | 381/11088 [03:20<1:28:18, 2.02it/s]
3%|β | 382/11088 [03:21<1:28:14, 2.02it/s]
3%|β | 383/11088 [03:21<1:28:18, 2.02it/s]
3%|β | 384/11088 [03:22<1:28:14, 2.02it/s]
3%|β | 385/11088 [03:22<1:28:15, 2.02it/s]
3%|β | 386/11088 [03:23<1:28:11, 2.02it/s]
3%|β | 387/11088 [03:23<1:28:11, 2.02it/s]
3%|β | 388/11088 [03:24<1:28:08, 2.02it/s]
4%|β | 389/11088 [03:24<1:28:14, 2.02it/s]
4%|β | 390/11088 [03:25<1:28:07, 2.02it/s]
4%|β | 391/11088 [03:25<1:28:06, 2.02it/s]
4%|β | 392/11088 [03:26<1:28:06, 2.02it/s]
4%|β | 393/11088 [03:26<1:28:11, 2.02it/s]
4%|β | 394/11088 [03:27<1:28:08, 2.02it/s]
4%|β | 395/11088 [03:27<1:28:12, 2.02it/s]
4%|β | 396/11088 [03:28<1:28:11, 2.02it/s]
4%|β | 397/11088 [03:28<1:28:09, 2.02it/s]
4%|β | 398/11088 [03:29<1:28:06, 2.02it/s]
4%|β | 399/11088 [03:29<1:28:08, 2.02it/s]
4%|β | 400/11088 [03:30<1:28:08, 2.02it/s]{'loss': 5.3414, 'grad_norm': 0.6309112310409546, 'learning_rate': 0.00036068530207394053, 'epoch': 0.51} |
|
4%|β | 400/11088 [03:30<1:28:08, 2.02it/s]
4%|β | 401/11088 [03:30<1:28:09, 2.02it/s]
4%|β | 402/11088 [03:31<1:28:07, 2.02it/s]
4%|β | 403/11088 [03:31<1:28:00, 2.02it/s]
4%|β | 404/11088 [03:32<1:28:06, 2.02it/s]
4%|β | 405/11088 [03:32<1:28:02, 2.02it/s]
4%|β | 406/11088 [03:33<1:28:04, 2.02it/s]
4%|β | 407/11088 [03:33<1:28:01, 2.02it/s]
4%|β | 408/11088 [03:34<1:28:02, 2.02it/s]
4%|β | 409/11088 [03:34<1:28:00, 2.02it/s]
4%|β | 410/11088 [03:35<1:28:00, 2.02it/s]
4%|β | 411/11088 [03:35<1:28:00, 2.02it/s]
4%|β | 412/11088 [03:36<1:28:02, 2.02it/s]
4%|β | 413/11088 [03:36<1:27:57, 2.02it/s]
4%|β | 414/11088 [03:37<1:27:55, 2.02it/s]
4%|β | 415/11088 [03:37<1:27:50, 2.02it/s]
4%|β | 416/11088 [03:38<1:27:52, 2.02it/s]
4%|β | 417/11088 [03:38<1:27:43, 2.03it/s]
4%|β | 418/11088 [03:39<1:27:49, 2.02it/s]
4%|β | 419/11088 [03:39<1:27:45, 2.03it/s]
4%|β | 420/11088 [03:40<1:27:49, 2.02it/s]
4%|β | 421/11088 [03:40<1:27:45, 2.03it/s]
4%|β | 422/11088 [03:41<1:27:38, 2.03it/s]
4%|β | 423/11088 [03:41<1:27:45, 2.03it/s]
4%|β | 424/11088 [03:42<1:27:40, 2.03it/s]
4%|β | 425/11088 [03:42<1:27:47, 2.02it/s]
{'loss': 5.2618, 'grad_norm': 0.6182384490966797, 'learning_rate': 0.0003832281334535618, 'epoch': 0.54} |
|
4%|β | 425/11088 [03:42<1:27:47, 2.02it/s]
4%|β | 426/11088 [03:43<1:27:45, 2.02it/s]
4%|β | 427/11088 [03:43<1:27:46, 2.02it/s]
4%|β | 428/11088 [03:43<1:27:42, 2.03it/s]
4%|β | 429/11088 [03:44<1:27:47, 2.02it/s]
4%|β | 430/11088 [03:44<1:27:43, 2.02it/s]
4%|β | 431/11088 [03:45<1:27:41, 2.03it/s]
4%|β | 432/11088 [03:45<1:27:44, 2.02it/s]
4%|β | 433/11088 [03:46<1:27:36, 2.03it/s]
4%|β | 434/11088 [03:46<1:27:42, 2.02it/s]
4%|β | 435/11088 [03:47<1:27:34, 2.03it/s]
4%|β | 436/11088 [03:47<1:27:41, 2.02it/s]
4%|β | 437/11088 [03:48<1:27:41, 2.02it/s]
4%|β | 438/11088 [03:48<1:27:45, 2.02it/s]
4%|β | 439/11088 [03:49<1:27:40, 2.02it/s]
4%|β | 440/11088 [03:49<1:27:41, 2.02it/s]
4%|β | 441/11088 [03:50<1:27:40, 2.02it/s]
4%|β | 442/11088 [03:50<1:27:47, 2.02it/s]
4%|β | 443/11088 [03:51<1:27:43, 2.02it/s]
4%|β | 444/11088 [03:51<1:27:44, 2.02it/s]
4%|β | 445/11088 [03:52<1:27:35, 2.03it/s]
4%|β | 446/11088 [03:52<1:27:42, 2.02it/s]
4%|β | 447/11088 [03:53<1:27:35, 2.02it/s]
4%|β | 448/11088 [03:53<1:27:33, 2.03it/s]
4%|β | 449/11088 [03:54<1:27:33, 2.03it/s]
4%|β | 450/11088 [03:54<1:27:30, 2.03it/s]
{'loss': 5.1861, 'grad_norm': 0.6628245711326599, 'learning_rate': 0.00040577096483318303, 'epoch': 0.57} |
|
4%|β | 450/11088 [03:54<1:27:30, 2.03it/s]
4%|β | 451/11088 [03:55<1:27:41, 2.02it/s]
4%|β | 452/11088 [03:55<1:27:32, 2.02it/s]
4%|β | 453/11088 [03:56<1:27:35, 2.02it/s]
4%|β | 454/11088 [03:56<1:27:28, 2.03it/s]
4%|β | 455/11088 [03:57<1:27:26, 2.03it/s]
4%|β | 456/11088 [03:57<1:27:28, 2.03it/s]
4%|β | 457/11088 [03:58<1:27:21, 2.03it/s]
4%|β | 458/11088 [03:58<1:27:25, 2.03it/s]
4%|β | 459/11088 [03:59<1:27:23, 2.03it/s]
4%|β | 460/11088 [03:59<1:27:29, 2.02it/s]
4%|β | 461/11088 [04:00<1:27:25, 2.03it/s]
4%|β | 462/11088 [04:00<1:27:27, 2.03it/s]
4%|β | 463/11088 [04:01<1:27:33, 2.02it/s]
4%|β | 464/11088 [04:01<1:27:34, 2.02it/s]
4%|β | 465/11088 [04:02<1:27:34, 2.02it/s]
4%|β | 466/11088 [04:02<1:27:35, 2.02it/s]
4%|β | 467/11088 [04:03<1:27:28, 2.02it/s]
4%|β | 468/11088 [04:03<1:27:31, 2.02it/s]
4%|β | 469/11088 [04:04<1:27:25, 2.02it/s]
4%|β | 470/11088 [04:04<1:27:30, 2.02it/s]
4%|β | 471/11088 [04:05<1:27:25, 2.02it/s]
4%|β | 472/11088 [04:05<1:27:28, 2.02it/s]
4%|β | 473/11088 [04:06<1:27:21, 2.03it/s]
4%|β | 474/11088 [04:06<1:27:26, 2.02it/s]
4%|β | 475/11088 [04:07<1:27:20, 2.03it/s]
{'loss': 5.1232, 'grad_norm': 0.49333590269088745, 'learning_rate': 0.00042831379621280434, 'epoch': 0.6} |
|
4%|β | 475/11088 [04:07<1:27:20, 2.03it/s]
4%|β | 476/11088 [04:07<1:27:29, 2.02it/s]
4%|β | 477/11088 [04:08<1:27:22, 2.02it/s]
4%|β | 478/11088 [04:08<1:27:26, 2.02it/s]
4%|β | 479/11088 [04:09<1:27:21, 2.02it/s]
4%|β | 480/11088 [04:09<1:27:25, 2.02it/s]
4%|β | 481/11088 [04:10<1:27:19, 2.02it/s]
4%|β | 482/11088 [04:10<1:27:20, 2.02it/s]
4%|β | 483/11088 [04:11<1:27:18, 2.02it/s]
4%|β | 484/11088 [04:11<1:27:12, 2.03it/s]
4%|β | 485/11088 [04:12<1:27:14, 2.03it/s]
4%|β | 486/11088 [04:12<1:27:13, 2.03it/s]
4%|β | 487/11088 [04:13<1:27:18, 2.02it/s]
4%|β | 488/11088 [04:13<1:27:13, 2.03it/s]
4%|β | 489/11088 [04:14<1:27:16, 2.02it/s]
4%|β | 490/11088 [04:14<1:27:11, 2.03it/s]
4%|β | 491/11088 [04:15<1:27:12, 2.03it/s]
4%|β | 492/11088 [04:15<1:27:15, 2.02it/s]
4%|β | 493/11088 [04:16<1:27:17, 2.02it/s]
4%|β | 494/11088 [04:16<1:27:12, 2.02it/s]
4%|β | 495/11088 [04:17<1:27:15, 2.02it/s]
4%|β | 496/11088 [04:17<1:27:10, 2.03it/s]
4%|β | 497/11088 [04:18<1:27:13, 2.02it/s]
4%|β | 498/11088 [04:18<1:27:10, 2.02it/s]
5%|β | 499/11088 [04:19<1:27:11, 2.02it/s]
5%|β | 500/11088 [04:19<1:27:10, 2.02it/s]
{'loss': 5.062, 'grad_norm': 0.5728343725204468, 'learning_rate': 0.00045085662759242564, 'epoch': 0.63} |
|
5%|β | 500/11088 [04:19<1:27:10, 2.02it/s]
5%|β | 501/11088 [04:20<1:27:23, 2.02it/s]
5%|β | 502/11088 [04:20<1:27:18, 2.02it/s]
5%|β | 503/11088 [04:21<1:27:18, 2.02it/s]
5%|β | 504/11088 [04:21<1:27:10, 2.02it/s]
5%|β | 505/11088 [04:22<1:27:11, 2.02it/s]
5%|β | 506/11088 [04:22<1:27:09, 2.02it/s]
5%|β | 507/11088 [04:23<1:27:06, 2.02it/s]
5%|β | 508/11088 [04:23<1:27:11, 2.02it/s]
5%|β | 509/11088 [04:24<1:27:12, 2.02it/s]
5%|β | 510/11088 [04:24<1:27:11, 2.02it/s]
5%|β | 511/11088 [04:24<1:27:10, 2.02it/s]
5%|β | 512/11088 [04:25<1:27:10, 2.02it/s]
5%|β | 513/11088 [04:25<1:27:04, 2.02it/s]
5%|β | 514/11088 [04:26<1:27:06, 2.02it/s]
5%|β | 515/11088 [04:26<1:26:56, 2.03it/s]
5%|β | 516/11088 [04:27<1:27:00, 2.03it/s]
5%|β | 517/11088 [04:27<1:26:56, 2.03it/s]
5%|β | 518/11088 [04:28<1:27:00, 2.02it/s]
5%|β | 519/11088 [04:28<1:27:00, 2.02it/s]
5%|β | 520/11088 [04:29<1:26:56, 2.03it/s]
5%|β | 521/11088 [04:29<1:27:02, 2.02it/s]
5%|β | 522/11088 [04:30<1:26:59, 2.02it/s]
5%|β | 523/11088 [04:30<1:27:00, 2.02it/s]
5%|β | 524/11088 [04:31<1:26:49, 2.03it/s]
5%|β | 525/11088 [04:31<1:26:52, 2.03it/s]{'loss': 5.0021, 'grad_norm': 0.5581721067428589, 'learning_rate': 0.0004733994589720469, 'epoch': 0.66}
|
|
5%|β | 525/11088 [04:31<1:26:52, 2.03it/s]
5%|β | 526/11088 [04:32<1:26:54, 2.03it/s]
5%|β | 527/11088 [04:32<1:26:56, 2.02it/s]
5%|β | 528/11088 [04:33<1:26:52, 2.03it/s]
5%|β | 529/11088 [04:33<1:26:57, 2.02it/s]
5%|β | 530/11088 [04:34<1:26:56, 2.02it/s]
5%|β | 531/11088 [04:34<1:26:58, 2.02it/s]
5%|β | 532/11088 [04:35<1:26:56, 2.02it/s]
5%|β | 533/11088 [04:35<1:26:54, 2.02it/s]
5%|β | 534/11088 [04:36<1:26:55, 2.02it/s]
5%|β | 535/11088 [04:36<1:26:55, 2.02it/s]
5%|β | 536/11088 [04:37<1:26:54, 2.02it/s]
5%|β | 537/11088 [04:37<1:26:52, 2.02it/s]
5%|β | 538/11088 [04:38<1:26:49, 2.03it/s]
5%|β | 539/11088 [04:38<1:26:51, 2.02it/s]
5%|β | 540/11088 [04:39<1:26:46, 2.03it/s]
5%|β | 541/11088 [04:39<1:26:48, 2.02it/s]
5%|β | 542/11088 [04:40<1:26:47, 2.03it/s]
5%|β | 543/11088 [04:40<1:26:49, 2.02it/s]
5%|β | 544/11088 [04:41<1:26:49, 2.02it/s]
5%|β | 545/11088 [04:41<1:26:47, 2.02it/s]
5%|β | 546/11088 [04:42<1:26:48, 2.02it/s]
5%|β | 547/11088 [04:42<1:26:51, 2.02it/s]
5%|β | 548/11088 [04:43<1:26:50, 2.02it/s]
5%|β | 549/11088 [04:43<1:26:43, 2.03it/s]
5%|β | 550/11088 [04:44<1:26:39, 2.03it/s]
{'loss': 4.9501, 'grad_norm': 0.5733642578125, 'learning_rate': 0.0004959422903516681, 'epoch': 0.69} |
|
5%|β | 550/11088 [04:44<1:26:39, 2.03it/s]
5%|β | 551/11088 [04:44<1:26:41, 2.03it/s]
5%|β | 552/11088 [04:45<1:26:44, 2.02it/s]
5%|β | 553/11088 [04:45<1:26:44, 2.02it/s]
5%|β | 554/11088 [04:46<1:26:42, 2.02it/s]
5%|β | 555/11088 [04:46<1:26:39, 2.03it/s]
5%|β | 556/11088 [04:47<1:26:41, 2.02it/s]
5%|β | 557/11088 [04:47<1:26:41, 2.02it/s]
5%|β | 558/11088 [04:48<1:26:40, 2.02it/s]
5%|β | 559/11088 [04:48<1:26:41, 2.02it/s]
5%|β | 560/11088 [04:49<1:26:41, 2.02it/s]
5%|β | 561/11088 [04:49<1:26:37, 2.03it/s]
5%|β | 562/11088 [04:50<1:26:37, 2.03it/s]
5%|β | 563/11088 [04:50<1:26:34, 2.03it/s]
5%|β | 564/11088 [04:51<1:26:36, 2.03it/s]
5%|β | 565/11088 [04:51<1:26:35, 2.03it/s]
5%|β | 566/11088 [04:52<1:26:38, 2.02it/s]
5%|β | 567/11088 [04:52<1:26:41, 2.02it/s]
5%|β | 568/11088 [04:53<1:26:37, 2.02it/s]
5%|β | 569/11088 [04:53<1:26:38, 2.02it/s]
5%|β | 570/11088 [04:54<1:26:40, 2.02it/s]
5%|β | 571/11088 [04:54<1:26:39, 2.02it/s]
5%|β | 572/11088 [04:55<1:26:38, 2.02it/s]
5%|β | 573/11088 [04:55<1:26:32, 2.03it/s]
5%|β | 574/11088 [04:56<1:26:31, 2.03it/s]
5%|β | 575/11088 [04:56<1:26:29, 2.03it/s]{'loss': 4.9062, 'grad_norm': 0.7264406085014343, 'learning_rate': 0.0005184851217312895, 'epoch': 0.73}
|
|
5%|β | 575/11088 [04:56<1:26:29, 2.03it/s]
5%|β | 576/11088 [04:57<1:26:32, 2.02it/s]
5%|β | 577/11088 [04:57<1:26:31, 2.02it/s]
5%|β | 578/11088 [04:58<1:26:29, 2.03it/s]
5%|β | 579/11088 [04:58<1:26:30, 2.02it/s]
5%|β | 580/11088 [04:59<1:26:23, 2.03it/s]
5%|β | 581/11088 [04:59<1:26:27, 2.03it/s]
5%|β | 582/11088 [05:00<1:26:28, 2.02it/s]
5%|β | 583/11088 [05:00<1:26:21, 2.03it/s]
5%|β | 584/11088 [05:01<1:26:26, 2.03it/s]
5%|β | 585/11088 [05:01<1:26:23, 2.03it/s]
5%|β | 586/11088 [05:02<1:26:26, 2.02it/s]
5%|β | 587/11088 [05:02<1:26:24, 2.03it/s]
5%|β | 588/11088 [05:03<1:26:25, 2.02it/s]
5%|β | 589/11088 [05:03<1:26:27, 2.02it/s]
5%|β | 590/11088 [05:04<1:26:28, 2.02it/s]
5%|β | 591/11088 [05:04<1:26:27, 2.02it/s]
5%|β | 592/11088 [05:05<1:26:21, 2.03it/s]
5%|β | 593/11088 [05:05<1:26:26, 2.02it/s]
5%|β | 594/11088 [05:05<1:26:20, 2.03it/s]
5%|β | 595/11088 [05:06<1:26:23, 2.02it/s]
5%|β | 596/11088 [05:06<1:26:20, 2.03it/s]
5%|β | 597/11088 [05:07<1:26:22, 2.02it/s]
5%|β | 598/11088 [05:07<1:26:20, 2.03it/s]
5%|β | 599/11088 [05:08<1:26:21, 2.02it/s]
5%|β | 600/11088 [05:08<1:26:23, 2.02it/s]
{'loss': 4.8559, 'grad_norm': 0.4734659790992737, 'learning_rate': 0.0005410279531109106, 'epoch': 0.76} |
|
5%|β | 600/11088 [05:08<1:26:23, 2.02it/s]
5%|β | 601/11088 [05:09<1:26:29, 2.02it/s]
5%|β | 602/11088 [05:09<1:26:28, 2.02it/s]
5%|β | 603/11088 [05:10<1:26:26, 2.02it/s]
5%|β | 604/11088 [05:10<1:26:18, 2.02it/s]
5%|β | 605/11088 [05:11<1:26:20, 2.02it/s]
5%|β | 606/11088 [05:11<1:26:16, 2.02it/s]
5%|β | 607/11088 [05:12<1:26:16, 2.02it/s]
5%|β | 608/11088 [05:12<1:26:13, 2.03it/s]
5%|β | 609/11088 [05:13<1:26:14, 2.03it/s]
6%|β | 610/11088 [05:13<1:26:15, 2.02it/s]
6%|β | 611/11088 [05:14<1:26:10, 2.03it/s]
6%|β | 612/11088 [05:14<1:26:15, 2.02it/s]
6%|β | 613/11088 [05:15<1:26:08, 2.03it/s]
6%|β | 614/11088 [05:15<1:26:14, 2.02it/s]
6%|β | 615/11088 [05:16<1:33:43, 1.86it/s]
6%|β | 616/11088 [05:17<1:38:54, 1.76it/s]
6%|β | 617/11088 [05:17<1:35:01, 1.84it/s]
6%|β | 618/11088 [05:18<1:32:24, 1.89it/s]
6%|β | 619/11088 [05:18<1:30:29, 1.93it/s]
6%|β | 620/11088 [05:19<1:29:11, 1.96it/s]
6%|β | 621/11088 [05:19<1:28:14, 1.98it/s]
6%|β | 622/11088 [05:20<1:27:41, 1.99it/s]
6%|β | 623/11088 [05:20<1:27:13, 2.00it/s]
6%|β | 624/11088 [05:21<1:26:52, 2.01it/s]
6%|β | 625/11088 [05:21<1:26:38, 2.01it/s]
{'loss': 4.8127, 'grad_norm': 0.5146079063415527, 'learning_rate': 0.000563570784490532, 'epoch': 0.79} |
|
6%|β | 625/11088 [05:21<1:26:38, 2.01it/s]
6%|β | 626/11088 [05:22<1:26:38, 2.01it/s]
6%|β | 627/11088 [05:22<1:26:28, 2.02it/s]
6%|β | 628/11088 [05:23<1:26:23, 2.02it/s]
6%|β | 629/11088 [05:23<1:26:17, 2.02it/s]
6%|β | 630/11088 [05:24<1:26:15, 2.02it/s]
6%|β | 631/11088 [05:24<1:26:13, 2.02it/s]
6%|β | 632/11088 [05:25<1:26:17, 2.02it/s]
6%|β | 633/11088 [05:25<1:26:11, 2.02it/s]
6%|β | 634/11088 [05:26<1:26:09, 2.02it/s]
6%|β | 635/11088 [05:26<1:26:06, 2.02it/s]
6%|β | 636/11088 [05:27<1:26:09, 2.02it/s]
6%|β | 637/11088 [05:27<1:26:08, 2.02it/s]
6%|β | 638/11088 [05:28<1:26:06, 2.02it/s]
6%|β | 639/11088 [05:28<1:26:05, 2.02it/s]
6%|β | 640/11088 [05:29<1:26:07, 2.02it/s]
6%|β | 641/11088 [05:29<1:26:05, 2.02it/s]
6%|β | 642/11088 [05:29<1:26:03, 2.02it/s]
6%|β | 643/11088 [05:30<1:26:05, 2.02it/s]
6%|β | 644/11088 [05:30<1:26:04, 2.02it/s]
6%|β | 645/11088 [05:31<1:26:03, 2.02it/s]
6%|β | 646/11088 [05:31<1:26:02, 2.02it/s]
6%|β | 647/11088 [05:32<1:26:00, 2.02it/s]
6%|β | 648/11088 [05:32<1:26:01, 2.02it/s]
6%|β | 649/11088 [05:33<1:26:04, 2.02it/s]
6%|β | 650/11088 [05:33<1:26:03, 2.02it/s]
{'loss': 4.7747, 'grad_norm': 0.5695474743843079, 'learning_rate': 0.0005861136158701534, 'epoch': 0.82} |
|
6%|β | 650/11088 [05:33<1:26:03, 2.02it/s]
6%|β | 651/11088 [05:34<1:26:08, 2.02it/s]
6%|β | 652/11088 [05:34<1:26:03, 2.02it/s]
6%|β | 653/11088 [05:35<1:26:03, 2.02it/s]
6%|β | 654/11088 [05:35<1:26:02, 2.02it/s]
6%|β | 655/11088 [05:36<1:26:03, 2.02it/s]
6%|β | 656/11088 [05:36<1:25:59, 2.02it/s]
6%|β | 657/11088 [05:37<1:25:58, 2.02it/s]
6%|β | 658/11088 [05:37<1:25:57, 2.02it/s]
6%|β | 659/11088 [05:38<1:25:57, 2.02it/s]
6%|β | 660/11088 [05:38<1:25:57, 2.02it/s]
6%|β | 661/11088 [05:39<1:25:55, 2.02it/s]
6%|β | 662/11088 [05:39<1:25:57, 2.02it/s]
6%|β | 663/11088 [05:40<1:25:50, 2.02it/s]
6%|β | 664/11088 [05:40<1:25:50, 2.02it/s]
6%|β | 665/11088 [05:41<1:25:42, 2.03it/s]
6%|β | 666/11088 [05:41<1:25:49, 2.02it/s]
6%|β | 667/11088 [05:42<1:25:42, 2.03it/s]
6%|β | 668/11088 [05:42<1:25:43, 2.03it/s]
6%|β | 669/11088 [05:43<1:25:39, 2.03it/s]
6%|β | 670/11088 [05:43<1:25:36, 2.03it/s]
6%|β | 671/11088 [05:44<1:25:38, 2.03it/s]
6%|β | 672/11088 [05:44<1:25:34, 2.03it/s]
6%|β | 673/11088 [05:45<1:25:33, 2.03it/s]
6%|β | 674/11088 [05:45<1:25:35, 2.03it/s]
6%|β | 675/11088 [05:46<1:25:31, 2.03it/s]
{'loss': 4.736, 'grad_norm': 0.4409935176372528, 'learning_rate': 0.0006086564472497745, 'epoch': 0.85} |
|
6%|β | 675/11088 [05:46<1:25:31, 2.03it/s]
6%|β | 676/11088 [05:46<1:25:39, 2.03it/s]
6%|β | 677/11088 [05:47<1:25:33, 2.03it/s]
6%|β | 678/11088 [05:47<1:25:38, 2.03it/s]
6%|β | 679/11088 [05:48<1:25:37, 2.03it/s]
6%|β | 680/11088 [05:48<1:25:37, 2.03it/s]
6%|β | 681/11088 [05:49<1:25:35, 2.03it/s]
6%|β | 682/11088 [05:49<1:25:39, 2.02it/s]
6%|β | 683/11088 [05:50<1:25:38, 2.03it/s]
6%|β | 684/11088 [05:50<1:25:43, 2.02it/s]
6%|β | 685/11088 [05:51<1:25:42, 2.02it/s]
6%|β | 686/11088 [05:51<1:25:38, 2.02it/s]
6%|β | 687/11088 [05:52<1:25:34, 2.03it/s]
6%|β | 688/11088 [05:52<1:25:31, 2.03it/s]
6%|β | 689/11088 [05:53<1:25:31, 2.03it/s]
6%|β | 690/11088 [05:53<1:25:30, 2.03it/s]
6%|β | 691/11088 [05:54<1:25:30, 2.03it/s]
6%|β | 692/11088 [05:54<1:25:32, 2.03it/s]
6%|β | 693/11088 [05:55<1:25:37, 2.02it/s]
6%|β | 694/11088 [05:55<1:25:36, 2.02it/s]
6%|β | 695/11088 [05:56<1:25:37, 2.02it/s]
6%|β | 696/11088 [05:56<1:25:33, 2.02it/s]
6%|β | 697/11088 [05:57<1:25:32, 2.02it/s]
6%|β | 698/11088 [05:57<1:25:27, 2.03it/s]
6%|β | 699/11088 [05:58<1:25:28, 2.03it/s]
6%|β | 700/11088 [05:58<1:25:28, 2.03it/s]
{'loss': 4.7106, 'grad_norm': 0.5907058715820312, 'learning_rate': 0.0006311992786293959, 'epoch': 0.88}
|
|
6%|β | 700/11088 [05:58<1:25:28, 2.03it/s]
6%|β | 701/11088 [05:59<1:25:37, 2.02it/s]
6%|β | 702/11088 [05:59<1:25:35, 2.02it/s]
6%|β | 703/11088 [06:00<1:25:34, 2.02it/s]
6%|β | 704/11088 [06:00<1:25:31, 2.02it/s]
6%|β | 705/11088 [06:01<1:25:32, 2.02it/s]
6%|β | 706/11088 [06:01<1:25:32, 2.02it/s]
6%|β | 707/11088 [06:02<1:25:30, 2.02it/s]
6%|β | 708/11088 [06:02<1:25:31, 2.02it/s]
6%|β | 709/11088 [06:03<1:25:34, 2.02it/s]
6%|β | 710/11088 [06:03<1:25:26, 2.02it/s]
6%|β | 711/11088 [06:04<1:25:32, 2.02it/s]
6%|β | 712/11088 [06:04<1:25:25, 2.02it/s]
6%|β | 713/11088 [06:05<1:25:26, 2.02it/s]
6%|β | 714/11088 [06:05<1:25:23, 2.02it/s]
6%|β | 715/11088 [06:06<1:25:20, 2.03it/s]
6%|β | 716/11088 [06:06<1:25:24, 2.02it/s]
6%|β | 717/11088 [06:07<1:25:18, 2.03it/s]
6%|β | 718/11088 [06:07<1:25:22, 2.02it/s]
6%|β | 719/11088 [06:08<1:25:21, 2.02it/s]
6%|β | 720/11088 [06:08<1:25:20, 2.02it/s]
7%|β | 721/11088 [06:09<1:25:21, 2.02it/s]
7%|β | 722/11088 [06:09<1:25:21, 2.02it/s]
7%|β | 723/11088 [06:10<1:25:24, 2.02it/s]
7%|β | 724/11088 [06:10<1:25:21, 2.02it/s]
7%|β | 725/11088 [06:11<1:25:18, 2.02it/s]
{'loss': 4.6677, 'grad_norm': 0.5601474642753601, 'learning_rate': 0.0006537421100090172, 'epoch': 0.92} |
|
7%|β | 725/11088 [06:11<1:25:18, 2.02it/s]
7%|β | 726/11088 [06:11<1:25:18, 2.02it/s]
7%|β | 727/11088 [06:11<1:25:17, 2.02it/s]
7%|β | 728/11088 [06:12<1:25:17, 2.02it/s]
7%|β | 729/11088 [06:12<1:25:12, 2.03it/s]
7%|β | 730/11088 [06:13<1:25:16, 2.02it/s]
7%|β | 731/11088 [06:13<1:25:08, 2.03it/s]
7%|β | 732/11088 [06:14<1:25:11, 2.03it/s]
7%|β | 733/11088 [06:14<1:25:11, 2.03it/s]
7%|β | 734/11088 [06:15<1:25:13, 2.02it/s]
7%|β | 735/11088 [06:15<1:25:12, 2.03it/s]
7%|β | 736/11088 [06:16<1:25:12, 2.02it/s]
7%|β | 737/11088 [06:16<1:25:18, 2.02it/s]
7%|β | 738/11088 [06:17<1:25:18, 2.02it/s]
7%|β | 739/11088 [06:17<1:25:19, 2.02it/s]
7%|β | 740/11088 [06:18<1:25:15, 2.02it/s]
7%|β | 741/11088 [06:18<1:25:15, 2.02it/s]
7%|β | 742/11088 [06:19<1:25:10, 2.02it/s]
7%|β | 743/11088 [06:19<1:25:07, 2.03it/s]
7%|β | 744/11088 [06:20<1:25:10, 2.02it/s]
7%|β | 745/11088 [06:20<1:25:08, 2.02it/s]
7%|β | 746/11088 [06:21<1:25:07, 2.02it/s]
7%|β | 747/11088 [06:21<1:25:07, 2.02it/s]
7%|β | 748/11088 [06:22<1:25:09, 2.02it/s]
7%|β | 749/11088 [06:22<1:25:09, 2.02it/s]
7%|β | 750/11088 [06:23<1:25:08, 2.02it/s]
{'loss': 4.6427, 'grad_norm': 0.5141180753707886, 'learning_rate': 0.0006762849413886384, 'epoch': 0.95} |
|
7%|β | 750/11088 [06:23<1:25:08, 2.02it/s]
7%|β | 751/11088 [06:23<1:25:10, 2.02it/s]
7%|β | 752/11088 [06:24<1:25:13, 2.02it/s]
7%|β | 753/11088 [06:24<1:25:09, 2.02it/s]
7%|β | 754/11088 [06:25<1:25:09, 2.02it/s]
7%|β | 755/11088 [06:25<1:25:10, 2.02it/s]
7%|β | 756/11088 [06:26<1:25:07, 2.02it/s]
7%|β | 757/11088 [06:26<1:25:07, 2.02it/s]
7%|β | 758/11088 [06:27<1:25:06, 2.02it/s]
7%|β | 759/11088 [06:27<1:25:04, 2.02it/s]
7%|β | 760/11088 [06:28<1:25:03, 2.02it/s]
7%|β | 761/11088 [06:28<1:25:04, 2.02it/s]
7%|β | 762/11088 [06:29<1:25:02, 2.02it/s]
7%|β | 763/11088 [06:29<1:25:06, 2.02it/s]
7%|β | 764/11088 [06:30<1:25:03, 2.02it/s]
7%|β | 765/11088 [06:30<1:25:04, 2.02it/s]
7%|β | 766/11088 [06:31<1:25:01, 2.02it/s]
7%|β | 767/11088 [06:31<1:25:01, 2.02it/s]
7%|β | 768/11088 [06:32<1:25:03, 2.02it/s]
7%|β | 769/11088 [06:32<1:25:01, 2.02it/s]
7%|β | 770/11088 [06:33<1:24:59, 2.02it/s]
7%|β | 771/11088 [06:33<1:24:58, 2.02it/s]
7%|β | 772/11088 [06:34<1:24:55, 2.02it/s]
7%|β | 773/11088 [06:34<1:24:59, 2.02it/s]
7%|β | 774/11088 [06:35<1:24:57, 2.02it/s]
7%|β | 775/11088 [06:35<1:24:53, 2.02it/s]
{'loss': 4.6105, 'grad_norm': 0.38639992475509644, 'learning_rate': 0.0006988277727682597, 'epoch': 0.98} |
|
7%|β | 775/11088 [06:35<1:24:53, 2.02it/s]
7%|β | 776/11088 [06:36<1:25:05, 2.02it/s]
7%|β | 777/11088 [06:36<1:25:02, 2.02it/s]
7%|β | 778/11088 [06:37<1:25:04, 2.02it/s]
7%|β | 779/11088 [06:37<1:25:02, 2.02it/s]
7%|β | 780/11088 [06:38<1:24:58, 2.02it/s]
7%|β | 781/11088 [06:38<1:24:58, 2.02it/s]
7%|β | 782/11088 [06:39<1:24:55, 2.02it/s]
7%|β | 783/11088 [06:39<1:24:54, 2.02it/s]
7%|β | 784/11088 [06:40<1:24:56, 2.02it/s]
7%|β | 785/11088 [06:40<1:24:52, 2.02it/s]
7%|β | 786/11088 [06:41<1:24:45, 2.03it/s]
7%|β | 787/11088 [06:41<1:24:46, 2.03it/s]
7%|β | 788/11088 [06:42<1:24:47, 2.02it/s]
7%|β | 789/11088 [06:42<1:24:44, 2.03it/s]
7%|β | 790/11088 [06:43<1:24:44, 2.03it/s]
7%|β | 791/11088 [06:43<1:24:50, 2.02it/s]
7%|β | 792/11088 [06:44<1:24:18, 2.04it/s]
7%|β | 793/11088 [06:56<11:27:00, 4.00s/it]
7%|β | 794/11088 [06:56<8:26:43, 2.95s/it]
7%|β | 795/11088 [06:57<6:20:05, 2.22s/it]
7%|β | 796/11088 [06:57<4:51:46, 1.70s/it]
7%|β | 797/11088 [06:58<3:49:43, 1.34s/it]
7%|β | 798/11088 [06:58<3:06:16, 1.09s/it]
7%|β | 799/11088 [06:59<2:35:58, 1.10it/s]
7%|β | 800/11088 [06:59<2:14:34, 1.27it/s]
{'loss': 4.5759, 'grad_norm': 0.49646249413490295, 'learning_rate': 0.0007213706041478811, 'epoch': 1.01} |
|
7%|β | 800/11088 [06:59<2:14:34, 1.27it/s]
7%|β | 801/11088 [07:00<1:59:48, 1.43it/s]
7%|β | 802/11088 [07:00<1:49:10, 1.57it/s]
7%|β | 803/11088 [07:01<1:41:58, 1.68it/s]
7%|β | 804/11088 [07:01<1:36:45, 1.77it/s]
7%|β | 805/11088 [07:02<1:33:16, 1.84it/s]
7%|β | 806/11088 [07:02<1:30:40, 1.89it/s]
7%|β | 807/11088 [07:03<1:28:48, 1.93it/s]
7%|β | 808/11088 [07:03<1:27:32, 1.96it/s]
7%|β | 809/11088 [07:04<1:26:51, 1.97it/s]
7%|β | 810/11088 [07:04<1:26:09, 1.99it/s]
7%|β | 811/11088 [07:05<1:25:40, 2.00it/s]
7%|β | 812/11088 [07:05<1:25:23, 2.01it/s]
7%|β | 813/11088 [07:06<1:25:16, 2.01it/s]
7%|β | 814/11088 [07:06<1:25:02, 2.01it/s]
7%|β | 815/11088 [07:07<1:25:01, 2.01it/s]
7%|β | 816/11088 [07:07<1:24:50, 2.02it/s]
7%|β | 817/11088 [07:08<1:24:52, 2.02it/s]
7%|β | 818/11088 [07:08<1:24:55, 2.02it/s]
7%|β | 819/11088 [07:09<1:24:49, 2.02it/s] |