Text Generation
Transformers
PyTorch
rwkv
uncensored
Inference Endpoints
rwkv-14b-wizardlm / train_log.txt
Ian Walton
Initial commit.
541d020
NEW RUN 2023-05-18-02-13-24
{'load_model': './RWKV-4-Pile-14B-20230313-ctx8192-test1050.pth', 'wandb': '', 'proj_dir': './checkpoints-wizardlm', 'random_seed': -1, 'data_file': './train.npy', 'data_type': 'numpy', 'vocab_size': 50277, 'ctx_len': 2048, 'epoch_steps': 1000, 'epoch_count': 17, 'epoch_begin': 0, 'epoch_save': 5, 'micro_bsz': 2, 'n_layer': 40, 'n_embd': 5120, 'dim_att': 5120, 'dim_ffn': 20480, 'pre_ffn': 0, 'head_qk': 0, 'tiny_att_dim': 0, 'tiny_att_layer': -999, 'lr_init': 0.0001, 'lr_final': 5e-07, 'warmup_steps': 0, 'beta1': 0.9, 'beta2': 0.999, 'adam_eps': 1e-08, 'grad_cp': 0, 'my_pile_stage': 0, 'my_pile_shift': -1, 'my_pile_edecay': 0, 'layerwise_lr': 1, 'ds_bucket_mb': 200, 'my_img_version': 0, 'my_img_size': 0, 'my_img_bit': 0, 'my_img_clip': 'x', 'my_img_clip_scale': 1, 'my_img_l1_scale': 0, 'my_img_encoder': 'x', 'my_sample_len': 0, 'my_ffn_shift': 1, 'my_att_shift': 1, 'my_pos_emb': 0, 'load_partial': 0, 'magic_prime': 0, 'my_qa_mask': 0, 'my_testing': '', 'lora': True, 'lora_load': '', 'lora_r': 32, 'lora_alpha': 64.0, 'lora_dropout': 0.05, 'lora_parts': 'att,ffn,time,ln', 'logger': False, 'enable_checkpointing': False, 'default_root_dir': None, 'gradient_clip_val': 1.0, 'gradient_clip_algorithm': None, 'num_nodes': 1, 'num_processes': None, 'devices': '1', 'gpus': None, 'auto_select_gpus': None, 'tpu_cores': None, 'ipus': None, 'enable_progress_bar': True, 'overfit_batches': 0.0, 'track_grad_norm': -1, 'check_val_every_n_epoch': 100000000000000000000, 'fast_dev_run': False, 'accumulate_grad_batches': None, 'max_epochs': -1, 'min_epochs': None, 'max_steps': -1, 'min_steps': None, 'max_time': None, 'limit_train_batches': None, 'limit_val_batches': None, 'limit_test_batches': None, 'limit_predict_batches': None, 'val_check_interval': None, 'log_every_n_steps': 100000000000000000000, 'accelerator': 'gpu', 'strategy': 'deepspeed_stage_2', 'sync_batchnorm': False, 'precision': 'bf16', 'enable_model_summary': True, 'num_sanity_val_steps': 0, 'resume_from_checkpoint': None, 'profiler': None, 'benchmark': None, 'reload_dataloaders_every_n_epochs': 0, 'auto_lr_find': False, 'replace_sampler_ddp': False, 'detect_anomaly': False, 'auto_scale_batch_size': False, 'plugins': None, 'amp_backend': None, 'amp_level': None, 'move_metrics_to_cpu': False, 'multiple_trainloader_mode': 'max_size_cycle', 'inference_mode': True, 'my_timestamp': '2023-05-18-02-13-24', 'betas': (0.9, 0.999), 'real_bsz': 2, 'run_name': '50277 ctx2048 L40 D5120'}
{'zero_allow_untested_optimizer': True, 'zero_optimization': {'stage': 2, 'contiguous_gradients': True, 'overlap_comm': True, 'allgather_partitions': True, 'reduce_scatter': True, 'allgather_bucket_size': 200000000, 'reduce_bucket_size': 200000000, 'sub_group_size': 1000000000000}, 'activation_checkpointing': {'partition_activations': False, 'cpu_checkpointing': False, 'contiguous_memory_optimization': False, 'synchronize_checkpoint_boundary': False}, 'aio': {'block_size': 1048576, 'queue_depth': 8, 'single_submit': False, 'overlap_events': True, 'thread_count': 1}, 'gradient_accumulation_steps': 1, 'train_micro_batch_size_per_gpu': 2, 'gradient_clipping': 1.0, 'bf16': {'enabled': True}, 'compression_training': {'weight_quantization': {'shared_parameters': {}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {}, 'different_groups': {}}}}
NEW RUN 2023-05-18-02-18-22
{'load_model': './RWKV-4-Pile-14B-20230313-ctx8192-test1050.pth', 'wandb': '', 'proj_dir': './checkpoints-wizardlm', 'random_seed': -1, 'data_file': './train.npy', 'data_type': 'numpy', 'vocab_size': 50277, 'ctx_len': 1024, 'epoch_steps': 1000, 'epoch_count': 34, 'epoch_begin': 0, 'epoch_save': 5, 'micro_bsz': 2, 'n_layer': 40, 'n_embd': 5120, 'dim_att': 5120, 'dim_ffn': 20480, 'pre_ffn': 0, 'head_qk': 0, 'tiny_att_dim': 0, 'tiny_att_layer': -999, 'lr_init': 0.0001, 'lr_final': 5e-07, 'warmup_steps': 0, 'beta1': 0.9, 'beta2': 0.999, 'adam_eps': 1e-08, 'grad_cp': 0, 'my_pile_stage': 0, 'my_pile_shift': -1, 'my_pile_edecay': 0, 'layerwise_lr': 1, 'ds_bucket_mb': 200, 'my_img_version': 0, 'my_img_size': 0, 'my_img_bit': 0, 'my_img_clip': 'x', 'my_img_clip_scale': 1, 'my_img_l1_scale': 0, 'my_img_encoder': 'x', 'my_sample_len': 0, 'my_ffn_shift': 1, 'my_att_shift': 1, 'my_pos_emb': 0, 'load_partial': 0, 'magic_prime': 0, 'my_qa_mask': 0, 'my_testing': '', 'lora': True, 'lora_load': '', 'lora_r': 32, 'lora_alpha': 64.0, 'lora_dropout': 0.05, 'lora_parts': 'att,ffn,time,ln', 'logger': False, 'enable_checkpointing': False, 'default_root_dir': None, 'gradient_clip_val': 1.0, 'gradient_clip_algorithm': None, 'num_nodes': 1, 'num_processes': None, 'devices': '1', 'gpus': None, 'auto_select_gpus': None, 'tpu_cores': None, 'ipus': None, 'enable_progress_bar': True, 'overfit_batches': 0.0, 'track_grad_norm': -1, 'check_val_every_n_epoch': 100000000000000000000, 'fast_dev_run': False, 'accumulate_grad_batches': None, 'max_epochs': -1, 'min_epochs': None, 'max_steps': -1, 'min_steps': None, 'max_time': None, 'limit_train_batches': None, 'limit_val_batches': None, 'limit_test_batches': None, 'limit_predict_batches': None, 'val_check_interval': None, 'log_every_n_steps': 100000000000000000000, 'accelerator': 'gpu', 'strategy': 'deepspeed_stage_2', 'sync_batchnorm': False, 'precision': 'bf16', 'enable_model_summary': True, 'num_sanity_val_steps': 0, 'resume_from_checkpoint': None, 'profiler': None, 'benchmark': None, 'reload_dataloaders_every_n_epochs': 0, 'auto_lr_find': False, 'replace_sampler_ddp': False, 'detect_anomaly': False, 'auto_scale_batch_size': False, 'plugins': None, 'amp_backend': None, 'amp_level': None, 'move_metrics_to_cpu': False, 'multiple_trainloader_mode': 'max_size_cycle', 'inference_mode': True, 'my_timestamp': '2023-05-18-02-18-22', 'betas': (0.9, 0.999), 'real_bsz': 2, 'run_name': '50277 ctx1024 L40 D5120'}
{'zero_allow_untested_optimizer': True, 'zero_optimization': {'stage': 2, 'contiguous_gradients': True, 'overlap_comm': True, 'allgather_partitions': True, 'reduce_scatter': True, 'allgather_bucket_size': 200000000, 'reduce_bucket_size': 200000000, 'sub_group_size': 1000000000000}, 'activation_checkpointing': {'partition_activations': False, 'cpu_checkpointing': False, 'contiguous_memory_optimization': False, 'synchronize_checkpoint_boundary': False}, 'aio': {'block_size': 1048576, 'queue_depth': 8, 'single_submit': False, 'overlap_events': True, 'thread_count': 1}, 'gradient_accumulation_steps': 1, 'train_micro_batch_size_per_gpu': 2, 'gradient_clipping': 1.0, 'bf16': {'enabled': True}, 'compression_training': {'weight_quantization': {'shared_parameters': {}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {}, 'different_groups': {}}}}
0 0.974954 2.6510 0.00008557 2023-05-18 02:35:03.067663 0
1 0.924752 2.5212 0.00007322 2023-05-18 02:48:33.964340 1
2 0.910025 2.4844 0.00006266 2023-05-18 03:02:04.389023 2
3 0.896199 2.4503 0.00005362 2023-05-18 03:15:34.393033 3
4 0.891289 2.4383 0.00004588 2023-05-18 03:29:04.556924 4
5 0.887027 2.4279 0.00003926 2023-05-18 03:42:35.363919 5
6 0.879992 2.4109 0.00003359 2023-05-18 03:56:07.111067 6
7 0.873740 2.3959 0.00002875 2023-05-18 04:09:40.671492 7
8 0.876330 2.4021 0.00002460 2023-05-18 04:23:14.937769 8
9 0.859262 2.3614 0.00002105 2023-05-18 04:36:47.992056 9
10 0.857725 2.3578 0.00001801 2023-05-18 04:50:21.573587 10
11 0.854920 2.3512 0.00001541 2023-05-18 05:03:55.170411 11
12 0.853412 2.3476 0.00001319 2023-05-18 05:17:28.057519 12
13 0.841703 2.3203 0.00001129 2023-05-18 05:31:01.106771 13
14 0.854889 2.3511 0.00000966 2023-05-18 05:44:33.658881 14
15 0.850775 2.3415 0.00000826 2023-05-18 05:58:08.160597 15
16 0.851631 2.3435 0.00000707 2023-05-18 06:11:46.384325 16
17 0.839762 2.3158 0.00000605 2023-05-18 06:25:27.382862 17
18 0.850289 2.3403 0.00000518 2023-05-18 06:39:08.960637 18
19 0.841697 2.3203 0.00000443 2023-05-18 06:52:46.666994 19
20 0.839498 2.3152 0.00000379 2023-05-18 07:06:22.116873 20
21 0.842402 2.3219 0.00000324 2023-05-18 07:20:03.309937 21
22 0.830740 2.2950 0.00000278 2023-05-18 07:33:35.949427 22
23 0.838361 2.3126 0.00000238 2023-05-18 07:47:07.854821 23
24 0.843396 2.3242 0.00000203 2023-05-18 08:00:38.640102 24
25 0.833445 2.3012 0.00000174 2023-05-18 08:14:09.935184 25
26 0.835568 2.3061 0.00000149 2023-05-18 08:27:42.156833 26
27 0.842768 2.3228 0.00000127 2023-05-18 08:41:15.406521 27
28 0.840123 2.3167 0.00000109 2023-05-18 08:54:47.893448 28
29 0.834012 2.3025 0.00000093 2023-05-18 09:08:20.361893 29
30 0.833059 2.3003 0.00000080 2023-05-18 09:21:53.894218 30
31 0.838252 2.3123 0.00000068 2023-05-18 09:35:35.929064 31
32 0.834691 2.3041 0.00000058 2023-05-18 09:49:17.445363 32
33 0.849344 2.3381 0.00000050 2023-05-18 10:02:56.926832 33
34 0.832176 2.2983 0.00000050 2023-05-18 10:16:30.915617 34
35 0.833998 2.3025 0.00000050 2023-05-18 10:30:03.204456 35
36 0.844871 2.3277 0.00000050 2023-05-18 10:43:36.046350 36
37 0.844629 2.3271 0.00000050 2023-05-18 10:57:08.279003 37
38 0.840990 2.3187 0.00000050 2023-05-18 11:10:36.765075 38
39 0.835910 2.3069 0.00000050 2023-05-18 11:24:05.882433 39
40 0.836146 2.3075 0.00000050 2023-05-18 11:37:36.168271 40
41 0.835250 2.3054 0.00000050 2023-05-18 11:51:05.636469 41
42 0.833586 2.3016 0.00000050 2023-05-18 12:04:35.482199 42
43 0.838982 2.3140 0.00000050 2023-05-18 12:18:05.848889 43
44 0.837347 2.3102 0.00000050 2023-05-18 12:31:35.710046 44
45 0.842637 2.3225 0.00000050 2023-05-18 12:45:05.453466 45