rwkv-trainingtest / train1 /train_log.txt
Xiami2000's picture
Upload folder using huggingface_hub
e62482f
raw
history blame
14.5 kB
NEW RUN 2023-12-05-05-15-15
{'load_model': '/content/RWKV-LM-LoRA/RWKV-v4neo/RWKV-4-7B-world-one-novel-tuned-65k.pth', 'wandb': '', 'proj_dir': '/content/RWKV-LM-LoRA/RWKV-v4neo/lora/', 'random_seed': -1, 'data_file': '/content/RWKV-LM-LoRA/RWKV-v4neo/data_text_document', 'data_type': 'binidx', 'vocab_size': 65536, 'ctx_len': 1024, 'epoch_steps': 5000, 'epoch_count': 1000, 'epoch_begin': 0, 'epoch_save': 1, 'micro_bsz': 2, 'n_layer': 32, 'n_embd': 4096, 'dim_att': 4096, 'dim_ffn': 16384, 'pre_ffn': 0, 'head_qk': 0, 'tiny_att_dim': 0, 'tiny_att_layer': -999, 'lr_init': 5e-05, 'lr_final': 1e-05, 'warmup_steps': 0, 'beta1': 0.9, 'beta2': 0.999, 'adam_eps': 1e-08, 'grad_cp': 0, 'my_pile_stage': 0, 'my_pile_shift': -1, 'my_pile_edecay': 0, 'layerwise_lr': 1, 'ds_bucket_mb': 200, 'my_img_version': 0, 'my_img_size': 0, 'my_img_bit': 0, 'my_img_clip': 'x', 'my_img_clip_scale': 1, 'my_img_l1_scale': 0, 'my_img_encoder': 'x', 'my_sample_len': 0, 'my_ffn_shift': 1, 'my_att_shift': 1, 'my_pos_emb': 0, 'load_partial': 0, 'magic_prime': 0, 'my_qa_mask': 0, 'my_testing': '', 'lora': True, 'lora_load': '', 'lora_r': 768, 'lora_alpha': 512.0, 'lora_dropout': 0.01, 'lora_parts': 'att,ln,time', 'logger': False, 'enable_checkpointing': False, 'default_root_dir': None, 'gradient_clip_val': 1.0, 'gradient_clip_algorithm': None, 'num_nodes': 1, 'num_processes': None, 'devices': '1', 'gpus': None, 'auto_select_gpus': None, 'tpu_cores': None, 'ipus': None, 'enable_progress_bar': True, 'overfit_batches': 0.0, 'track_grad_norm': -1, 'check_val_every_n_epoch': 100000000000000000000, 'fast_dev_run': False, 'accumulate_grad_batches': 4, 'max_epochs': -1, 'min_epochs': None, 'max_steps': -1, 'min_steps': None, 'max_time': None, 'limit_train_batches': None, 'limit_val_batches': None, 'limit_test_batches': None, 'limit_predict_batches': None, 'val_check_interval': None, 'log_every_n_steps': 100000000000000000000, 'accelerator': 'gpu', 'strategy': 'deepspeed_stage_2', 'sync_batchnorm': False, 'precision': 'bf16', 'enable_model_summary': True, 'num_sanity_val_steps': 0, 'resume_from_checkpoint': None, 'profiler': None, 'benchmark': None, 'reload_dataloaders_every_n_epochs': 0, 'auto_lr_find': False, 'replace_sampler_ddp': False, 'detect_anomaly': False, 'auto_scale_batch_size': False, 'plugins': None, 'amp_backend': None, 'amp_level': None, 'move_metrics_to_cpu': False, 'multiple_trainloader_mode': 'max_size_cycle', 'inference_mode': True, 'my_timestamp': '2023-12-05-05-15-15', 'betas': (0.9, 0.999), 'real_bsz': 2, 'run_name': '65536 ctx1024 L32 D4096'}
{'zero_allow_untested_optimizer': True, 'zero_optimization': {'stage': 2, 'contiguous_gradients': True, 'overlap_comm': True, 'allgather_partitions': True, 'reduce_scatter': True, 'allgather_bucket_size': 200000000, 'reduce_bucket_size': 200000000, 'sub_group_size': 1000000000000}, 'activation_checkpointing': {'partition_activations': False, 'cpu_checkpointing': False, 'contiguous_memory_optimization': False, 'synchronize_checkpoint_boundary': False}, 'aio': {'block_size': 1048576, 'queue_depth': 8, 'single_submit': False, 'overlap_events': True, 'thread_count': 1}, 'gradient_accumulation_steps': 4, 'train_micro_batch_size_per_gpu': 2, 'gradient_clipping': 1.0, 'bf16': {'enabled': True}}
NEW RUN 2023-12-05-05-15-15
{'load_model': '/content/RWKV-LM-LoRA/RWKV-v4neo/RWKV-4-7B-world-one-novel-tuned-65k.pth', 'wandb': '', 'proj_dir': '/content/RWKV-LM-LoRA/RWKV-v4neo/lora/', 'random_seed': -1, 'data_file': '/content/RWKV-LM-LoRA/RWKV-v4neo/data_text_document', 'data_type': 'binidx', 'vocab_size': 65536, 'ctx_len': 1024, 'epoch_steps': 5000, 'epoch_count': 1000, 'epoch_begin': 0, 'epoch_save': 1, 'micro_bsz': 2, 'n_layer': 32, 'n_embd': 4096, 'dim_att': 4096, 'dim_ffn': 16384, 'pre_ffn': 0, 'head_qk': 0, 'tiny_att_dim': 0, 'tiny_att_layer': -999, 'lr_init': 5e-05, 'lr_final': 1e-05, 'warmup_steps': 0, 'beta1': 0.9, 'beta2': 0.999, 'adam_eps': 1e-08, 'grad_cp': 0, 'my_pile_stage': 0, 'my_pile_shift': -1, 'my_pile_edecay': 0, 'layerwise_lr': 1, 'ds_bucket_mb': 200, 'my_img_version': 0, 'my_img_size': 0, 'my_img_bit': 0, 'my_img_clip': 'x', 'my_img_clip_scale': 1, 'my_img_l1_scale': 0, 'my_img_encoder': 'x', 'my_sample_len': 0, 'my_ffn_shift': 1, 'my_att_shift': 1, 'my_pos_emb': 0, 'load_partial': 0, 'magic_prime': 0, 'my_qa_mask': 0, 'my_testing': '', 'lora': True, 'lora_load': '', 'lora_r': 768, 'lora_alpha': 512.0, 'lora_dropout': 0.01, 'lora_parts': 'att,ln,time', 'logger': False, 'enable_checkpointing': False, 'default_root_dir': None, 'gradient_clip_val': 1.0, 'gradient_clip_algorithm': None, 'num_nodes': 1, 'num_processes': None, 'devices': '1', 'gpus': None, 'auto_select_gpus': None, 'tpu_cores': None, 'ipus': None, 'enable_progress_bar': True, 'overfit_batches': 0.0, 'track_grad_norm': -1, 'check_val_every_n_epoch': 100000000000000000000, 'fast_dev_run': False, 'accumulate_grad_batches': 4, 'max_epochs': -1, 'min_epochs': None, 'max_steps': -1, 'min_steps': None, 'max_time': None, 'limit_train_batches': None, 'limit_val_batches': None, 'limit_test_batches': None, 'limit_predict_batches': None, 'val_check_interval': None, 'log_every_n_steps': 100000000000000000000, 'accelerator': 'gpu', 'strategy': 'deepspeed_stage_2', 'sync_batchnorm': False, 'precision': 'bf16', 'enable_model_summary': True, 'num_sanity_val_steps': 0, 'resume_from_checkpoint': None, 'profiler': None, 'benchmark': None, 'reload_dataloaders_every_n_epochs': 0, 'auto_lr_find': False, 'replace_sampler_ddp': False, 'detect_anomaly': False, 'auto_scale_batch_size': False, 'plugins': None, 'amp_backend': None, 'amp_level': None, 'move_metrics_to_cpu': False, 'multiple_trainloader_mode': 'max_size_cycle', 'inference_mode': True, 'my_timestamp': '2023-12-05-05-15-15', 'betas': (0.9, 0.999), 'real_bsz': 2, 'run_name': '65536 ctx1024 L32 D4096'}
{'zero_allow_untested_optimizer': True, 'zero_optimization': {'stage': 2, 'contiguous_gradients': True, 'overlap_comm': True, 'allgather_partitions': True, 'reduce_scatter': True, 'allgather_bucket_size': 200000000, 'reduce_bucket_size': 200000000, 'sub_group_size': 1000000000000}, 'activation_checkpointing': {'partition_activations': False, 'cpu_checkpointing': False, 'contiguous_memory_optimization': False, 'synchronize_checkpoint_boundary': False}, 'aio': {'block_size': 1048576, 'queue_depth': 8, 'single_submit': False, 'overlap_events': True, 'thread_count': 1}, 'gradient_accumulation_steps': 4, 'train_micro_batch_size_per_gpu': 2, 'gradient_clipping': 1.0, 'bf16': {'enabled': True}}
NEW RUN 2023-12-05-05-15-15
{'load_model': '/content/RWKV-LM-LoRA/RWKV-v4neo/RWKV-4-7B-world-one-novel-tuned-65k.pth', 'wandb': '', 'proj_dir': '/content/RWKV-LM-LoRA/RWKV-v4neo/lora/', 'random_seed': -1, 'data_file': '/content/RWKV-LM-LoRA/RWKV-v4neo/data_text_document', 'data_type': 'binidx', 'vocab_size': 65536, 'ctx_len': 1024, 'epoch_steps': 5000, 'epoch_count': 1000, 'epoch_begin': 0, 'epoch_save': 1, 'micro_bsz': 2, 'n_layer': 32, 'n_embd': 4096, 'dim_att': 4096, 'dim_ffn': 16384, 'pre_ffn': 0, 'head_qk': 0, 'tiny_att_dim': 0, 'tiny_att_layer': -999, 'lr_init': 5e-05, 'lr_final': 1e-05, 'warmup_steps': 0, 'beta1': 0.9, 'beta2': 0.999, 'adam_eps': 1e-08, 'grad_cp': 0, 'my_pile_stage': 0, 'my_pile_shift': -1, 'my_pile_edecay': 0, 'layerwise_lr': 1, 'ds_bucket_mb': 200, 'my_img_version': 0, 'my_img_size': 0, 'my_img_bit': 0, 'my_img_clip': 'x', 'my_img_clip_scale': 1, 'my_img_l1_scale': 0, 'my_img_encoder': 'x', 'my_sample_len': 0, 'my_ffn_shift': 1, 'my_att_shift': 1, 'my_pos_emb': 0, 'load_partial': 0, 'magic_prime': 0, 'my_qa_mask': 0, 'my_testing': '', 'lora': True, 'lora_load': '', 'lora_r': 768, 'lora_alpha': 512.0, 'lora_dropout': 0.01, 'lora_parts': 'att,ln,time', 'logger': False, 'enable_checkpointing': False, 'default_root_dir': None, 'gradient_clip_val': 1.0, 'gradient_clip_algorithm': None, 'num_nodes': 1, 'num_processes': None, 'devices': '1', 'gpus': None, 'auto_select_gpus': None, 'tpu_cores': None, 'ipus': None, 'enable_progress_bar': True, 'overfit_batches': 0.0, 'track_grad_norm': -1, 'check_val_every_n_epoch': 100000000000000000000, 'fast_dev_run': False, 'accumulate_grad_batches': 4, 'max_epochs': -1, 'min_epochs': None, 'max_steps': -1, 'min_steps': None, 'max_time': None, 'limit_train_batches': None, 'limit_val_batches': None, 'limit_test_batches': None, 'limit_predict_batches': None, 'val_check_interval': None, 'log_every_n_steps': 100000000000000000000, 'accelerator': 'gpu', 'strategy': 'deepspeed_stage_2', 'sync_batchnorm': False, 'precision': 'bf16', 'enable_model_summary': True, 'num_sanity_val_steps': 0, 'resume_from_checkpoint': None, 'profiler': None, 'benchmark': None, 'reload_dataloaders_every_n_epochs': 0, 'auto_lr_find': False, 'replace_sampler_ddp': False, 'detect_anomaly': False, 'auto_scale_batch_size': False, 'plugins': None, 'amp_backend': None, 'amp_level': None, 'move_metrics_to_cpu': False, 'multiple_trainloader_mode': 'max_size_cycle', 'inference_mode': True, 'my_timestamp': '2023-12-05-05-15-15', 'betas': (0.9, 0.999), 'real_bsz': 2, 'run_name': '65536 ctx1024 L32 D4096'}
{'zero_allow_untested_optimizer': True, 'zero_optimization': {'stage': 2, 'contiguous_gradients': True, 'overlap_comm': True, 'allgather_partitions': True, 'reduce_scatter': True, 'allgather_bucket_size': 200000000, 'reduce_bucket_size': 200000000, 'sub_group_size': 1000000000000}, 'activation_checkpointing': {'partition_activations': False, 'cpu_checkpointing': False, 'contiguous_memory_optimization': False, 'synchronize_checkpoint_boundary': False}, 'aio': {'block_size': 1048576, 'queue_depth': 8, 'single_submit': False, 'overlap_events': True, 'thread_count': 1}, 'gradient_accumulation_steps': 4, 'train_micro_batch_size_per_gpu': 2, 'gradient_clipping': 1.0, 'bf16': {'enabled': True}}
NEW RUN 2023-12-05-05-15-15
{'load_model': '/content/RWKV-LM-LoRA/RWKV-v4neo/RWKV-4-7B-world-one-novel-tuned-65k.pth', 'wandb': '', 'proj_dir': '/content/RWKV-LM-LoRA/RWKV-v4neo/lora/', 'random_seed': -1, 'data_file': '/content/RWKV-LM-LoRA/RWKV-v4neo/data_text_document', 'data_type': 'binidx', 'vocab_size': 65536, 'ctx_len': 1024, 'epoch_steps': 5000, 'epoch_count': 1000, 'epoch_begin': 0, 'epoch_save': 1, 'micro_bsz': 2, 'n_layer': 32, 'n_embd': 4096, 'dim_att': 4096, 'dim_ffn': 16384, 'pre_ffn': 0, 'head_qk': 0, 'tiny_att_dim': 0, 'tiny_att_layer': -999, 'lr_init': 5e-05, 'lr_final': 1e-05, 'warmup_steps': 0, 'beta1': 0.9, 'beta2': 0.999, 'adam_eps': 1e-08, 'grad_cp': 0, 'my_pile_stage': 0, 'my_pile_shift': -1, 'my_pile_edecay': 0, 'layerwise_lr': 1, 'ds_bucket_mb': 200, 'my_img_version': 0, 'my_img_size': 0, 'my_img_bit': 0, 'my_img_clip': 'x', 'my_img_clip_scale': 1, 'my_img_l1_scale': 0, 'my_img_encoder': 'x', 'my_sample_len': 0, 'my_ffn_shift': 1, 'my_att_shift': 1, 'my_pos_emb': 0, 'load_partial': 0, 'magic_prime': 0, 'my_qa_mask': 0, 'my_testing': '', 'lora': True, 'lora_load': '', 'lora_r': 768, 'lora_alpha': 512.0, 'lora_dropout': 0.01, 'lora_parts': 'att,ln,time', 'logger': False, 'enable_checkpointing': False, 'default_root_dir': None, 'gradient_clip_val': 1.0, 'gradient_clip_algorithm': None, 'num_nodes': 1, 'num_processes': None, 'devices': '1', 'gpus': None, 'auto_select_gpus': None, 'tpu_cores': None, 'ipus': None, 'enable_progress_bar': True, 'overfit_batches': 0.0, 'track_grad_norm': -1, 'check_val_every_n_epoch': 100000000000000000000, 'fast_dev_run': False, 'accumulate_grad_batches': 4, 'max_epochs': -1, 'min_epochs': None, 'max_steps': -1, 'min_steps': None, 'max_time': None, 'limit_train_batches': None, 'limit_val_batches': None, 'limit_test_batches': None, 'limit_predict_batches': None, 'val_check_interval': None, 'log_every_n_steps': 100000000000000000000, 'accelerator': 'gpu', 'strategy': 'deepspeed_stage_2', 'sync_batchnorm': False, 'precision': 'bf16', 'enable_model_summary': True, 'num_sanity_val_steps': 0, 'resume_from_checkpoint': None, 'profiler': None, 'benchmark': None, 'reload_dataloaders_every_n_epochs': 0, 'auto_lr_find': False, 'replace_sampler_ddp': False, 'detect_anomaly': False, 'auto_scale_batch_size': False, 'plugins': None, 'amp_backend': None, 'amp_level': None, 'move_metrics_to_cpu': False, 'multiple_trainloader_mode': 'max_size_cycle', 'inference_mode': True, 'my_timestamp': '2023-12-05-05-15-15', 'betas': (0.9, 0.999), 'real_bsz': 2, 'run_name': '65536 ctx1024 L32 D4096'}
{'zero_allow_untested_optimizer': True, 'zero_optimization': {'stage': 2, 'contiguous_gradients': True, 'overlap_comm': True, 'allgather_partitions': True, 'reduce_scatter': True, 'allgather_bucket_size': 200000000, 'reduce_bucket_size': 200000000, 'sub_group_size': 1000000000000}, 'activation_checkpointing': {'partition_activations': False, 'cpu_checkpointing': False, 'contiguous_memory_optimization': False, 'synchronize_checkpoint_boundary': False}, 'aio': {'block_size': 1048576, 'queue_depth': 8, 'single_submit': False, 'overlap_events': True, 'thread_count': 1}, 'gradient_accumulation_steps': 4, 'train_micro_batch_size_per_gpu': 2, 'gradient_clipping': 1.0, 'bf16': {'enabled': True}}
0 2.217211 9.1817 0.00004998 2023-12-05 06:03:21.680230 0
1 2.177034 8.8201 0.00004996 2023-12-05 06:47:52.540017 1
2 2.177449 8.8238 0.00004994 2023-12-05 07:32:23.819467 2
3 2.172328 8.7787 0.00004992 2023-12-05 08:16:55.257359 3
4 2.174535 8.7981 0.00004990 2023-12-05 09:01:25.444398 4
5 2.162517 8.6930 0.00004988 2023-12-05 09:45:54.660611 5
6 2.162742 8.6949 0.00004986 2023-12-05 10:30:25.115663 6
7 2.154770 8.6259 0.00004984 2023-12-05 11:14:54.953275 7
8 2.160273 8.6735 0.00004982 2023-12-05 11:59:24.673533 8
9 2.155170 8.6294 0.00004980 2023-12-05 12:43:56.055237 9
10 2.155403 8.6314 0.00004978 2023-12-05 13:28:27.925363 10
11 2.149700 8.5823 0.00004976 2023-12-05 14:13:00.743157 11
12 2.148756 8.5742 0.00004974 2023-12-05 14:57:33.860255 12
13 2.150493 8.5891 0.00004972 2023-12-05 15:42:05.478916 13
14 2.144194 8.5352 0.00004970 2023-12-05 16:26:37.694615 14
15 2.144805 8.5404 0.00004968 2023-12-05 17:11:10.691345 15
16 2.147698 8.5651 0.00004966 2023-12-05 17:55:44.666483 16
17 2.152827 8.6092 0.00004964 2023-12-05 18:40:18.125535 17
18 2.134348 8.4515 0.00004962 2023-12-05 19:24:50.286459 18
19 2.144168 8.5349 0.00004960 2023-12-05 20:09:22.953973 19
20 2.140627 8.5048 0.00004958 2023-12-05 20:53:54.409335 20
21 2.136445 8.4693 0.00004956 2023-12-05 21:38:25.559175 21
22 2.143549 8.5297 0.00004954 2023-12-05 22:22:57.537193 22
23 2.130694 8.4207 0.00004952 2023-12-05 23:07:28.968411 23
24 2.135613 8.4622 0.00004950 2023-12-05 23:52:00.099832 24