sanchit-gandhi HF staff committed on
Commit 8ed4a53
1 Parent(s): 50ba432

Upload folder using huggingface_hub

This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50)
  1. .gitattributes +15 -0
  2. finetuning_concatenated_config.json +54 -0
  3. run_parler_tts_training.py +1763 -0
  4. wandb/debug-cli.sanchit.log +0 -0
  5. wandb/debug-internal.log +0 -0
  6. wandb/debug.log +35 -0
  7. wandb/run-20240513_204652-m0g0ap7d/files/conda-environment.yaml +248 -0
  8. wandb/run-20240513_204652-m0g0ap7d/files/config.yaml +86 -0
  9. wandb/run-20240513_204652-m0g0ap7d/files/output.log +180 -0
  10. wandb/run-20240513_204652-m0g0ap7d/files/requirements.txt +225 -0
  11. wandb/run-20240513_204652-m0g0ap7d/files/wandb-metadata.json +706 -0
  12. wandb/run-20240513_204652-m0g0ap7d/files/wandb-summary.json +1 -0
  13. wandb/run-20240513_204652-m0g0ap7d/logs/debug-internal.log +455 -0
  14. wandb/run-20240513_204652-m0g0ap7d/logs/debug.log +29 -0
  15. wandb/run-20240513_204652-m0g0ap7d/run-m0g0ap7d.wandb +0 -0
  16. wandb/run-20240513_205249-qaoje1x9/files/conda-environment.yaml +248 -0
  17. wandb/run-20240513_205249-qaoje1x9/files/config.yaml +88 -0
  18. wandb/run-20240513_205249-qaoje1x9/files/media/audio/Speech samples/eval_1016_193b029d494fd24e7cfa.wav +0 -0
  19. wandb/run-20240513_205249-qaoje1x9/files/media/audio/Speech samples/eval_1016_345bfb6a72849809d361.wav +0 -0
  20. wandb/run-20240513_205249-qaoje1x9/files/media/audio/Speech samples/eval_1016_3c9adbd9374e0fb5ce3d.wav +0 -0
  21. wandb/run-20240513_205249-qaoje1x9/files/media/audio/Speech samples/eval_1016_3cd94e4824cc6c8fb09c.wav +0 -0
  22. wandb/run-20240513_205249-qaoje1x9/files/media/audio/Speech samples/eval_1016_3ed6544e58dd861a5d9e.wav +0 -0
  23. wandb/run-20240513_205249-qaoje1x9/files/media/audio/Speech samples/eval_1016_9da1fed11be9d614d9ec.wav +0 -0
  24. wandb/run-20240513_205249-qaoje1x9/files/media/audio/Speech samples/eval_1016_ec838b0233dbe87d33f3.wav +0 -0
  25. wandb/run-20240513_205249-qaoje1x9/files/media/audio/Speech samples/eval_127_0db946e177a69cbe11f5.wav +0 -0
  26. wandb/run-20240513_205249-qaoje1x9/files/media/audio/Speech samples/eval_127_32c9af8d48e757598000.wav +0 -0
  27. wandb/run-20240513_205249-qaoje1x9/files/media/audio/Speech samples/eval_127_341c52fd92336c009f67.wav +0 -0
  28. wandb/run-20240513_205249-qaoje1x9/files/media/audio/Speech samples/eval_127_43ed5d3749c912acb591.wav +0 -0
  29. wandb/run-20240513_205249-qaoje1x9/files/media/audio/Speech samples/eval_127_75818e76e9e077f058be.wav +0 -0
  30. wandb/run-20240513_205249-qaoje1x9/files/media/audio/Speech samples/eval_127_d24330f3382b9e6ea7ea.wav +0 -0
  31. wandb/run-20240513_205249-qaoje1x9/files/media/audio/Speech samples/eval_127_ec7dcb5421538131ede7.wav +0 -0
  32. wandb/run-20240513_205249-qaoje1x9/files/media/audio/Speech samples/eval_254_2794dcaf322bd12e2814.wav +0 -0
  33. wandb/run-20240513_205249-qaoje1x9/files/media/audio/Speech samples/eval_254_2ef5b33e2eaf98dca4a6.wav +0 -0
  34. wandb/run-20240513_205249-qaoje1x9/files/media/audio/Speech samples/eval_254_4ca836a112634417b82e.wav +0 -0
  35. wandb/run-20240513_205249-qaoje1x9/files/media/audio/Speech samples/eval_254_837a3499e3f93538b643.wav +0 -0
  36. wandb/run-20240513_205249-qaoje1x9/files/media/audio/Speech samples/eval_254_b3650df61e399b05257d.wav +0 -0
  37. wandb/run-20240513_205249-qaoje1x9/files/media/audio/Speech samples/eval_254_d33ccbefe990db0dce2b.wav +0 -0
  38. wandb/run-20240513_205249-qaoje1x9/files/media/audio/Speech samples/eval_254_e8ca5038019cad3cde86.wav +0 -0
  39. wandb/run-20240513_205249-qaoje1x9/files/media/audio/Speech samples/eval_381_4899e0da4615e883ad13.wav +3 -0
  40. wandb/run-20240513_205249-qaoje1x9/files/media/audio/Speech samples/eval_381_492597073098578f0605.wav +3 -0
  41. wandb/run-20240513_205249-qaoje1x9/files/media/audio/Speech samples/eval_381_67d9409a306e3614ec3f.wav +3 -0
  42. wandb/run-20240513_205249-qaoje1x9/files/media/audio/Speech samples/eval_381_7c47ba927ac118ffaacc.wav +3 -0
  43. wandb/run-20240513_205249-qaoje1x9/files/media/audio/Speech samples/eval_381_89ad32d31f3e70178cc1.wav +3 -0
  44. wandb/run-20240513_205249-qaoje1x9/files/media/audio/Speech samples/eval_381_cd644667186ae0518a3c.wav +3 -0
  45. wandb/run-20240513_205249-qaoje1x9/files/media/audio/Speech samples/eval_381_f7405ef7b645b3265477.wav +3 -0
  46. wandb/run-20240513_205249-qaoje1x9/files/media/audio/Speech samples/eval_508_00e9064c0bdbd6b9428d.wav +0 -0
  47. wandb/run-20240513_205249-qaoje1x9/files/media/audio/Speech samples/eval_508_11adac906cb7e2ef30c6.wav +0 -0
  48. wandb/run-20240513_205249-qaoje1x9/files/media/audio/Speech samples/eval_508_5619d97860f92fc1a62d.wav +0 -0
  49. wandb/run-20240513_205249-qaoje1x9/files/media/audio/Speech samples/eval_508_bcb03b95f0470920bdc6.wav +0 -0
  50. wandb/run-20240513_205249-qaoje1x9/files/media/audio/Speech samples/eval_508_e48b4ff2b12d5ffdb11c.wav +0 -0
.gitattributes CHANGED
@@ -33,3 +33,18 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ wandb/run-20240513_205249-qaoje1x9/files/media/audio/Speech[[:space:]]samples/eval_381_4899e0da4615e883ad13.wav filter=lfs diff=lfs merge=lfs -text
+ wandb/run-20240513_205249-qaoje1x9/files/media/audio/Speech[[:space:]]samples/eval_381_492597073098578f0605.wav filter=lfs diff=lfs merge=lfs -text
+ wandb/run-20240513_205249-qaoje1x9/files/media/audio/Speech[[:space:]]samples/eval_381_67d9409a306e3614ec3f.wav filter=lfs diff=lfs merge=lfs -text
+ wandb/run-20240513_205249-qaoje1x9/files/media/audio/Speech[[:space:]]samples/eval_381_7c47ba927ac118ffaacc.wav filter=lfs diff=lfs merge=lfs -text
+ wandb/run-20240513_205249-qaoje1x9/files/media/audio/Speech[[:space:]]samples/eval_381_89ad32d31f3e70178cc1.wav filter=lfs diff=lfs merge=lfs -text
+ wandb/run-20240513_205249-qaoje1x9/files/media/audio/Speech[[:space:]]samples/eval_381_cd644667186ae0518a3c.wav filter=lfs diff=lfs merge=lfs -text
+ wandb/run-20240513_205249-qaoje1x9/files/media/audio/Speech[[:space:]]samples/eval_381_f7405ef7b645b3265477.wav filter=lfs diff=lfs merge=lfs -text
+ wandb/run-20240513_205249-qaoje1x9/files/media/audio/Speech[[:space:]]samples/eval_762_09db3d4c95c46a3bae02.wav filter=lfs diff=lfs merge=lfs -text
+ wandb/run-20240513_205249-qaoje1x9/files/media/audio/Speech[[:space:]]samples/eval_762_3349ee3cb71e3d789f00.wav filter=lfs diff=lfs merge=lfs -text
+ wandb/run-20240513_205249-qaoje1x9/files/media/audio/Speech[[:space:]]samples/eval_762_63964662df5bc7731b7a.wav filter=lfs diff=lfs merge=lfs -text
+ wandb/run-20240513_205249-qaoje1x9/files/media/audio/Speech[[:space:]]samples/eval_762_7ab4dededd2d3a052def.wav filter=lfs diff=lfs merge=lfs -text
+ wandb/run-20240513_205249-qaoje1x9/files/media/audio/Speech[[:space:]]samples/eval_762_b8be5a72ea66619af007.wav filter=lfs diff=lfs merge=lfs -text
+ wandb/run-20240513_205249-qaoje1x9/files/media/audio/Speech[[:space:]]samples/eval_762_d1c21fd700287f9f3109.wav filter=lfs diff=lfs merge=lfs -text
+ wandb/run-20240513_205249-qaoje1x9/files/media/audio/Speech[[:space:]]samples/eval_762_e8feadb7228a5b136ce8.wav filter=lfs diff=lfs merge=lfs -text
+ wandb/run-20240513_205249-qaoje1x9/run-qaoje1x9.wandb filter=lfs diff=lfs merge=lfs -text
finetuning_concatenated_config.json ADDED
@@ -0,0 +1,54 @@
+ {
+     "model_name_or_path": "parler-tts/parler_tts_mini_v0.1",
+     "feature_extractor_name": "parler-tts/dac_44khZ_8kbps",
+     "description_tokenizer_name": "parler-tts/parler_tts_mini_v0.1",
+     "prompt_tokenizer_name": "parler-tts/parler_tts_mini_v0.1",
+     "report_to": ["wandb"],
+     "overwrite_output_dir": true,
+     "train_dataset_name": "sanchit-gandhi/expresso-concatenated-half-normal+reach-vb/jenny_tts_dataset+sanchit-gandhi/libritts_r_test+sanchit-gandhi/libritts_r_test",
+     "train_metadata_dataset_name": "sanchit-gandhi/expresso-concatenated-half-normal-tags-mistral+ylacombe/jenny-tts-10k-tagged+parler-tts/libritts_r_tags_tagged_10k_generated+parler-tts/libritts_r_tags_tagged_10k_generated",
+     "train_dataset_config_name": "read+default+clean+other",
+     "train_split_name": "train+train[:20%]+test.clean+test.other",
+     "eval_dataset_name": "sanchit-gandhi/expresso-concatenated-half-normal+reach-vb/jenny_tts_dataset+sanchit-gandhi/libritts_r_test+sanchit-gandhi/libritts_r_test",
+     "eval_metadata_dataset_name": "sanchit-gandhi/expresso-concatenated-half-normal-tags-mistral+ylacombe/jenny-tts-10k-tagged+parler-tts/libritts_r_tags_tagged_10k_generated+parler-tts/libritts_r_tags_tagged_10k_generated",
+     "eval_dataset_config_name": "read+default+clean+other",
+     "eval_split_name": "train+train[:20%]+test.clean+test.other",
+     "max_eval_samples": 8,
+     "per_device_eval_batch_size": 16,
+     "target_audio_column_name": "audio",
+     "description_column_name": "text_description",
+     "prompt_column_name": "text",
+     "max_duration_in_seconds": 30.0,
+     "min_duration_in_seconds": 2.0,
+     "max_text_length": 400,
+     "preprocessing_num_workers": 2,
+     "do_train": true,
+     "num_train_epochs": 8,
+     "max_steps": -1,
+     "gradient_accumulation_steps": 8,
+     "gradient_checkpointing": true,
+     "per_device_train_batch_size": 16,
+     "learning_rate": 0.00008,
+     "adam_beta1": 0.9,
+     "adam_beta2": 0.99,
+     "weight_decay": 0.01,
+     "lr_scheduler_type": "cosine",
+     "warmup_steps": 250,
+     "logging_steps": 5,
+     "freeze_text_encoder": true,
+     "audio_encoder_per_device_batch_size": 4,
+     "dtype": "bfloat16",
+     "seed": 456,
+     "output_dir": "../output_dir_training_constant_concat/",
+     "temporary_save_to_disk": "../audio_code_tmp_constant_concat/",
+     "save_to_disk": "../tmp_dataset_audio_constant_concat/",
+     "dataloader_num_workers": 4,
+     "do_eval": true,
+     "predict_with_generate": true,
+     "include_inputs_for_metrics": true,
+     "save_strategy": "epoch",
+     "evaluation_strategy": "epoch",
+     "save_total_limit": 5,
+     "group_by_length": true
+ }
+
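
For context, the training script below consumes this JSON in one shot via `HfArgumentParser` when the file path is the only CLI argument. A minimal, hedged sketch of that mechanism, parsing just the `Seq2SeqTrainingArguments` subset of the keys (the full script also routes keys into its own `ModelArguments` and `DataTrainingArguments` dataclasses):

```python
import json

from transformers import HfArgumentParser, Seq2SeqTrainingArguments

# Load the commit's config and parse the subset of keys that belong to
# Seq2SeqTrainingArguments; allow_extra_keys skips the model/data-specific
# keys that the full script routes into its own dataclasses.
with open("finetuning_concatenated_config.json") as f:
    raw_config = json.load(f)

parser = HfArgumentParser(Seq2SeqTrainingArguments)
(training_args,) = parser.parse_dict(raw_config, allow_extra_keys=True)
print(training_args.learning_rate, training_args.num_train_epochs)
```
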
run_parler_tts_training.py ADDED
@@ -0,0 +1,1763 @@
+ #!/usr/bin/env python
+ # coding=utf-8
+ # Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ """ Train Parler-TTS using 🤗 Accelerate"""
+
+ import logging
+ import os
+ import re
+ import shutil
+ import sys
+ import time
+ from dataclasses import dataclass, field
+ from datetime import timedelta
+ from pathlib import Path
+ from typing import Dict, List, Optional, Set, Union
+
+ import datasets
+ import evaluate
+ import numpy as np
+ import torch
+ import transformers
+ from accelerate import Accelerator
+ from accelerate.utils import AutocastKwargs, InitProcessGroupKwargs, TorchDynamoPlugin, set_seed
+ from accelerate.utils.memory import release_memory
+ from datasets import Dataset, DatasetDict, IterableDataset, concatenate_datasets, interleave_datasets, load_dataset
+ from huggingface_hub import Repository, create_repo
+ from multiprocess import set_start_method
+ from torch.utils.data import DataLoader
+ from tqdm import tqdm
+ from transformers import (
+     AutoFeatureExtractor,
+     AutoModel,
+     AutoProcessor,
+     AutoTokenizer,
+     HfArgumentParser,
+     Seq2SeqTrainingArguments,
+     pipeline,
+ )
+ from transformers.optimization import get_scheduler
+ from transformers.trainer_pt_utils import LengthGroupedSampler
+ from transformers.utils import send_example_telemetry
+ from wandb import Audio
+
+ from parler_tts import (
+     ParlerTTSConfig,
+     ParlerTTSForConditionalGeneration,
+     build_delay_pattern_mask,
+ )
+
+
+ logger = logging.getLogger(__name__)
+
+
+ def list_field(default=None, metadata=None):
+     return field(default_factory=lambda: default, metadata=metadata)
+
+
+ _RE_CHECKPOINT = re.compile(r"^checkpoint-(\d+)-epoch-(\d+)$")
+
+
+ def get_last_checkpoint(folder):
+     content = os.listdir(folder)
+     checkpoints = [
+         path
+         for path in content
+         if _RE_CHECKPOINT.search(path) is not None and os.path.isdir(os.path.join(folder, path))
+     ]
+     if len(checkpoints) == 0:
+         return
+     return os.path.join(folder, max(checkpoints, key=lambda x: int(_RE_CHECKPOINT.search(x).groups()[0])))
+
+
+ def sorted_checkpoints(output_dir=None, checkpoint_prefix="checkpoint") -> List[str]:
+     """Helper function to sort saved checkpoints from oldest to newest."""
+     ordering_and_checkpoint_path = []
+
+     glob_checkpoints = [str(x) for x in Path(output_dir).glob(f"{checkpoint_prefix}-*") if os.path.isdir(x)]
+
+     for path in glob_checkpoints:
+         regex_match = re.match(f".*{checkpoint_prefix}-([0-9]+)", path)
+         if regex_match is not None and regex_match.groups() is not None:
+             ordering_and_checkpoint_path.append((int(regex_match.groups()[0]), path))
+
+     checkpoints_sorted = sorted(ordering_and_checkpoint_path)
+     checkpoints_sorted = [checkpoint[1] for checkpoint in checkpoints_sorted]
+     return checkpoints_sorted
+
+
+ def rotate_checkpoints(save_total_limit=None, output_dir=None, checkpoint_prefix="checkpoint") -> None:
+     """Helper function to delete old checkpoints."""
+     if save_total_limit is None or save_total_limit <= 0:
+         return
+     # Check if we should delete older checkpoint(s)
+     checkpoints_sorted = sorted_checkpoints(output_dir=output_dir, checkpoint_prefix=checkpoint_prefix)
+     if len(checkpoints_sorted) <= save_total_limit:
+         return
+
+     number_of_checkpoints_to_delete = max(0, len(checkpoints_sorted) - save_total_limit)
+     checkpoints_to_be_deleted = checkpoints_sorted[:number_of_checkpoints_to_delete]
+     for checkpoint in checkpoints_to_be_deleted:
+         logger.info(f"Deleting older checkpoint [{checkpoint}] due to args.save_total_limit")
+         shutil.rmtree(checkpoint, ignore_errors=True)
+
+
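
A small, self-contained illustration of the checkpoint helpers above, using hypothetical checkpoint directory names that follow the `checkpoint-{step}-epoch-{epoch}` convention matched by `_RE_CHECKPOINT` (assumes the three functions defined above are in scope):

```python
# Illustration only: exercise get_last_checkpoint / rotate_checkpoints on a
# temp dir populated with hypothetical checkpoint names.
import os
import tempfile

with tempfile.TemporaryDirectory() as folder:
    for name in ["checkpoint-500-epoch-1", "checkpoint-1000-epoch-2", "checkpoint-1500-epoch-3"]:
        os.makedirs(os.path.join(folder, name))

    print(get_last_checkpoint(folder))  # .../checkpoint-1500-epoch-3 (highest step wins)
    rotate_checkpoints(save_total_limit=2, output_dir=folder)
    print(sorted(os.listdir(folder)))   # oldest checkpoint-500-epoch-1 has been deleted
```
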
+ def log_metric(
+     accelerator,
+     metrics: Dict,
+     train_time: float,
+     step: int,
+     epoch: int,
+     learning_rate: float = None,
+     prefix: str = "train",
+ ):
+     """Helper function to log all training/evaluation metrics with the correct prefixes and styling."""
+     log_metrics = {}
+     for k, v in metrics.items():
+         log_metrics[f"{prefix}/{k}"] = v
+     log_metrics[f"{prefix}/time"] = train_time
+     log_metrics[f"{prefix}/epoch"] = epoch
+     if learning_rate is not None:
+         log_metrics[f"{prefix}/learning_rate"] = learning_rate
+     accelerator.log(log_metrics, step=step)
+
+
+ def log_pred(
+     accelerator,
+     pred_descriptions: List[str],
+     pred_prompts: List[str],
+     transcriptions: List[str],
+     audios: List[torch.Tensor],
+     sampling_rate: int,
+     step: int,
+     prefix: str = "eval",
+     num_lines: int = 200000,
+ ):
+     """Helper function to log target/predicted transcriptions to Weights & Biases (wandb)."""
+     if accelerator.is_main_process:
+         wandb_tracker = accelerator.get_tracker("wandb")
+         # pretty name for current step: step 50000 -> step 50k
+         cur_step_pretty = f"{int(step // 1000)}k" if step > 1000 else step
+         prefix_pretty = prefix.replace("/", "-")
+
+         # convert str data to a wandb compatible format
+         str_data = [[pred_descriptions[i], pred_prompts[i], transcriptions[i]] for i in range(len(pred_descriptions))]
+         # log as a table with the appropriate headers
+         wandb_tracker.log_table(
+             table_name=f"predictions/{prefix_pretty}-step-{cur_step_pretty}",
+             columns=["Target descriptions", "Target prompts", "Predicted transcriptions"],
+             data=str_data[:num_lines],
+             step=step,
+             commit=False,
+         )
+
+         # wandb can only load 100 audios per step
+         wandb_tracker.log(
+             {
+                 f"Speech samples/{prefix}": [
+                     Audio(
+                         audio,
+                         caption=f"{pred_prompts[i]} --- DESCRIPTION: {pred_descriptions[i]}",
+                         sample_rate=sampling_rate,
+                     )
+                     for (i, audio) in enumerate(audios[: min(len(audios), 100)])
+                 ]
+             },
+             step=step,
+         )
+
+
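
For clarity, this is the flat key layout `log_metric` produces, as a standalone sketch (no `accelerator` needed; the values are illustrative):

```python
# Standalone illustration of the "{prefix}/{key}" layout built by log_metric.
metrics = {"loss": 1.23, "grad_norm": 0.8}
prefix, train_time, epoch, learning_rate = "train", 42.0, 2, 8e-5

log_metrics = {f"{prefix}/{k}": v for k, v in metrics.items()}
log_metrics.update(
    {f"{prefix}/time": train_time, f"{prefix}/epoch": epoch, f"{prefix}/learning_rate": learning_rate}
)
print(log_metrics)
# {'train/loss': 1.23, 'train/grad_norm': 0.8, 'train/time': 42.0,
#  'train/epoch': 2, 'train/learning_rate': 8e-05}
```
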
+ @dataclass
+ class ModelArguments:
+     """
+     Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
+     """
+
+     model_name_or_path: str = field(
+         metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
+     )
+     config_name: Optional[str] = field(
+         default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
+     )
+     feature_extractor_name: Optional[str] = field(
+         default=None, metadata={"help": "Pretrained feature extractor name or path if not the same as model_name"}
+     )
+     description_tokenizer_name: Optional[str] = field(
+         default=None, metadata={"help": "Pretrained description tokenizer name or path if not the same as model_name"}
+     )
+     prompt_tokenizer_name: Optional[str] = field(
+         default=None,
+         metadata={"help": "Pretrained prompt tokenizer name or path if not the same as description_tokenizer_name"},
+     )
+     cache_dir: Optional[str] = field(
+         default=None,
+         metadata={"help": "Where to store the pretrained models downloaded from huggingface.co"},
+     )
+     use_fast_tokenizer: bool = field(
+         default=True,
+         metadata={"help": "Whether to use one of the fast tokenizers (backed by the tokenizers library) or not."},
+     )
+     model_revision: str = field(
+         default="main",
+         metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
+     )
+     pad_token_id: int = field(
+         default=None,
+         metadata={"help": "If specified, change the model pad token id."},
+     )
+     decoder_start_token_id: int = field(
+         default=None,
+         metadata={"help": "If specified, change the model decoder start token id."},
+     )
+     freeze_text_encoder: bool = field(
+         default=False,
+         metadata={"help": "Whether to freeze the text encoder."},
+     )
+     do_sample: bool = field(
+         default=True,
+         metadata={"help": "Whether to do sampling or greedy decoding."},
+     )
+     temperature: float = field(
+         default=1.0,
+         metadata={"help": "Temperature if sampling."},
+     )
+     max_length: int = field(
+         default=2580,
+         metadata={"help": "Generation max length."},
+     )
+     bandwidth: float = field(
+         default=6,
+         metadata={"help": "Audio encoder bandwidth."},
+     )
+     asr_model_name_or_path: str = field(
+         default="distil-whisper/distil-large-v2",
+         metadata={
+             "help": "Used to compute WER during evaluation. Path to pretrained model or model identifier from huggingface.co/models"
+         },
+     )
+     clap_model_name_or_path: str = field(
+         default="laion/larger_clap_music_and_speech",
+         metadata={
+             "help": "Used to compute audio similarity during evaluation. Path to pretrained model or model identifier from huggingface.co/models"
+         },
+     )
+
+
+ @dataclass
+ class DataTrainingArguments:
+     """
+     Arguments pertaining to what data we are going to input our model for training and eval.
+
+     Using `HfArgumentParser` we can turn this class
+     into argparse arguments to be able to specify them on
+     the command line.
+     """
+
+     train_dataset_name: str = field(
+         default=None,
+         metadata={
+             "help": "The name of the training dataset to use (via the datasets library). Load and combine "
+             "multiple datasets by separating dataset ids by a '+' symbol. For example, to load and combine "
+             "librispeech and common voice, set `train_dataset_name='librispeech_asr+common_voice'`."
+         },
+     )
+     train_dataset_config_name: Optional[str] = field(
+         default=None,
+         metadata={
+             "help": "The configuration name of the training dataset to use (via the datasets library). Load and combine "
+             "multiple datasets by separating dataset configs by a '+' symbol."
+         },
+     )
+     train_split_name: str = field(
+         default="train",
+         metadata={
+             "help": ("The name of the training data set split to use (via the datasets library). Defaults to 'train'")
+         },
+     )
+     train_dataset_samples: str = field(
+         default=None,
+         metadata={
+             "help": "Number of samples in the training data. Load and combine "
+             "multiple datasets by separating dataset samples by a '+' symbol."
+         },
+     )
+     train_metadata_dataset_name: str = field(
+         default=None,
+         metadata={
+             "help": "The name of the metadata training dataset to use (via the datasets library). Load and combine "
+             "multiple datasets by separating dataset ids by a '+' symbol. For example, to load and combine "
+             "librispeech and common voice, set `train_dataset_name='librispeech_asr+common_voice'`."
+         },
+     )
+     eval_dataset_name: str = field(
+         default=None,
+         metadata={
+             "help": "The name of the evaluation dataset to use (via the datasets library). Defaults to the training dataset name if unspecified."
+         },
+     )
+     eval_dataset_config_name: Optional[str] = field(
+         default=None,
+         metadata={
+             "help": "The configuration name of the evaluation dataset to use (via the datasets library). Defaults to the training dataset config name if unspecified."
+         },
+     )
+     eval_split_name: str = field(
+         default="test",
+         metadata={
+             "help": "The name of the evaluation data set split to use (via the datasets library). Defaults to 'test'"
+         },
+     )
+     eval_metadata_dataset_name: str = field(
+         default=None,
+         metadata={
+             "help": "The name of the metadata evaluation dataset to use (via the datasets library). Load and combine "
+             "multiple datasets by separating dataset ids by a '+' symbol. For example, to load and combine "
+             "librispeech and common voice, set `train_dataset_name='librispeech_asr+common_voice'`."
+         },
+     )
+     target_audio_column_name: str = field(
+         default="audio",
+         metadata={"help": "The name of the dataset column containing the target audio data. Defaults to 'audio'"},
+     )
+     description_column_name: str = field(
+         default=None,
+         metadata={"help": "The name of the dataset column containing the description text data. Defaults to 'None'."},
+     )
+     prompt_column_name: str = field(
+         default=None,
+         metadata={"help": "The name of the dataset column containing the prompt text data. Defaults to 'None'."},
+     )
+     overwrite_cache: bool = field(
+         default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."}
+     )
+     preprocessing_num_workers: Optional[int] = field(
+         default=None,
+         metadata={"help": "The number of processes to use for the preprocessing."},
+     )
+     max_train_samples: Optional[int] = field(
+         default=None,
+         metadata={
+             "help": (
+                 "For debugging purposes or quicker training, truncate the number of training examples to this "
+                 "value if set."
+             )
+         },
+     )
+     max_eval_samples: Optional[int] = field(
+         default=None,
+         metadata={
+             "help": (
+                 "For debugging purposes or quicker training, truncate the number of validation examples to this "
+                 "value if set."
+             )
+         },
+     )
+     max_duration_in_seconds: float = field(
+         default=35.0,
+         metadata={
+             "help": (
+                 "Filter out audio files that are longer than `max_duration_in_seconds` seconds. "
+                 "Also used to set the maximum audio length if `pad_to_max_length=True`."
+             )
+         },
+     )
+     min_duration_in_seconds: float = field(
+         default=0.0, metadata={"help": "Filter audio files that are shorter than `min_duration_in_seconds` seconds"}
+     )
+     max_text_length: int = field(
+         default=500, metadata={"help": "If set, max description lengths in number of characters."}
+     )
+     max_prompt_token_length: int = field(
+         default=None,
+         metadata={
+             "help": (
+                 "If set, filter samples with prompts that are longer than `max_prompt_token_length` tokens. "
+                 "Also used to set the maximum prompt token length if `pad_to_max_length=True`."
+             )
+         },
+     )
+     max_description_token_length: int = field(
+         default=None,
+         metadata={
+             "help": (
+                 "If set, filter samples with descriptions that are longer than `max_description_token_length` tokens. "
+                 "Also used to set the maximum description token length if `pad_to_max_length=True`."
+             )
+         },
+     )
+     pad_to_max_length: bool = field(
+         default=False,
+         metadata={
+             "help": (
+                 "If `True`, pad audio, prompt and description to a maximum length set with respectively "
+                 "`max_duration_in_seconds`, `max_prompt_token_length`, `max_description_token_length`."
+             )
+         },
+     )
+     preprocessing_only: bool = field(
+         default=False,
+         metadata={
+             "help": (
+                 "Whether to only do data preprocessing and skip training. This is especially useful when data"
+                 " preprocessing errors out in distributed training due to timeout. In this case, one should run the"
+                 " preprocessing in a non-distributed setup with `preprocessing_only=True` so that the cached datasets"
+                 " can consequently be loaded in distributed training."
+                 " In this training script, `save_to_disk` must be set to the path in which the dataset should be saved."
+             )
+         },
+     )
+     token: str = field(
+         default=None,
+         metadata={
+             "help": (
+                 "The token to use as HTTP bearer authorization for remote files. If not specified, will use the token "
+                 "generated when running `huggingface-cli login` (stored in `~/.huggingface`)."
+             )
+         },
+     )
+     use_auth_token: bool = field(
+         default=None,
+         metadata={
+             "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead."
+         },
+     )
+     trust_remote_code: bool = field(
+         default=False,
+         metadata={
+             "help": (
+                 "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
+                 "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
+                 "execute code present on the Hub on your local machine."
+             )
+         },
+     )
+     add_audio_samples_to_wandb: bool = field(
+         default=False,
+         metadata={"help": "If set and if `wandb` is in args.report_to, will add generated audio samples to wandb logs."},
+     )
+     id_column_name: str = field(default=None, metadata={"help": "id column name."})
+     wandb_project: str = field(
+         default="parler-speech",
+         metadata={"help": "The name of the wandb project."},
+     )
+     save_to_disk: str = field(
+         default=None,
+         metadata={
+             "help": "If set, will save the dataset to this path if this is an empty folder. If not empty, will load the datasets from it."
+         },
+     )
+     temporary_save_to_disk: str = field(default=None, metadata={"help": "Temporarily save audio labels here."})
+     pad_to_multiple_of: Optional[int] = field(
+         default=2,
+         metadata={"help": ("Pad to multiple of for tokenizers.")},
+     )
+
+
+ @dataclass
+ class ParlerTTSTrainingArguments(Seq2SeqTrainingArguments):
+     dtype: Optional[str] = field(
+         default="float32",
+         metadata={
+             "help": (
+                 "The data type (dtype) in which to run training. One of `float32` (full-precision), "
+                 "`float16` or `bfloat16` (both half-precision)."
+             )
+         },
+     )
+     audio_encoder_per_device_batch_size: int = field(
+         default=8,
+         metadata={"help": ("Specify the batch size of the audio encoding pre-processing steps.")},
+     )
+
+
+ @dataclass
+ class DataCollatorEncodecWithPadding:
+     """
+     Data collator that will dynamically pad the inputs received to the longest sequence in the batch or
+     to `max_length` if `max_length` is set and `padding=max_length`.
+     """
+
+     feature_extractor: AutoFeatureExtractor
+     audio_column_name: str
+     feature_extractor_input_name: Optional[str] = "input_values"
+     max_length: Optional[int] = None
+     padding: Optional[str] = "longest"
+
+     def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
+         # split inputs and labels since they have to be of different lengths and need
+         # different padding methods
+         audios = [feature[self.audio_column_name]["array"] for feature in features]
+         len_audio = [len(audio) for audio in audios]
+
+         batch = self.feature_extractor(
+             audios,
+             return_tensors="pt",
+             padding=self.padding,
+             max_length=self.max_length,
+             sampling_rate=self.feature_extractor.sampling_rate,
+         )
+         batch["len_audio"] = torch.tensor(len_audio).unsqueeze(1)
+         return batch
+
+
+ @dataclass
+ class DataCollatorParlerTTSWithPadding:
+     """
+     Data collator that will dynamically pad the inputs received.
+     Args:
+         prompt_tokenizer (:class:`~transformers.AutoTokenizer`)
+             The prompt_tokenizer used for processing the data.
+         description_tokenizer (:class:`~transformers.AutoTokenizer`)
+             The description_tokenizer used for processing the data.
+         padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
+             Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
+             among:
+             * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
+               sequence is provided).
+             * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
+               maximum acceptable input length for the model if that argument is not provided.
+             * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
+               different lengths).
+         pad_to_multiple_of (:obj:`int`, `optional`):
+             If set will pad the sequence to a multiple of the provided value.
+             This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
+             7.5 (Volta).
+     """
+
+     prompt_tokenizer: AutoTokenizer
+     description_tokenizer: AutoTokenizer
+     padding: Union[bool, str] = "longest"
+     pad_to_multiple_of: Optional[int] = None
+     prompt_max_length: Optional[int] = None
+     description_max_length: Optional[int] = None
+     audio_max_length: Optional[int] = None
+
+     def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
+         # split inputs and labels since they have to be of different lengths and need
+         # different padding methods
+
+         labels = [torch.tensor(feature["labels"]).transpose(0, 1) for feature in features]
+         # (bsz, seq_len, num_codebooks)
+         labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=-100)
+         if self.audio_max_length is not None and self.padding == "max_length":
+             labels = torch.nn.functional.pad(labels, pad=(0, 0, 0, max(self.audio_max_length - labels.shape[1], 0)))
+
+         input_ids = [{"input_ids": feature["input_ids"]} for feature in features]
+
+         input_ids = self.description_tokenizer.pad(
+             input_ids,
+             return_tensors="pt",
+             padding=self.padding,
+             pad_to_multiple_of=self.pad_to_multiple_of,
+             max_length=self.description_max_length,
+         )
+
+         batch = {"labels": labels, **input_ids}
+
+         if self.audio_max_length is not None and self.padding == "max_length":
+             # if we do torch.compile, we need to also specify the attention_mask
+             decoder_attention_mask = torch.ones(labels.shape[:2], dtype=input_ids["attention_mask"].dtype)
+             batch["decoder_attention_mask"] = decoder_attention_mask
+
+         prompt_input_ids = [{"input_ids": feature["prompt_input_ids"]} for feature in features]
+         prompt_input_ids = self.prompt_tokenizer.pad(
+             prompt_input_ids,
+             return_tensors="pt",
+             padding=self.padding,
+             pad_to_multiple_of=self.pad_to_multiple_of,
+             max_length=self.prompt_max_length,
+         )
+
+         batch["prompt_input_ids"] = prompt_input_ids["input_ids"]
+         if "attention_mask" in prompt_input_ids:
+             batch["prompt_attention_mask"] = prompt_input_ids["attention_mask"]
+
+         return batch
+
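
A hedged sketch of the collator in action on two dummy examples. The tokenizer checkpoint here is a stand-in assumption (`google/flan-t5-small`); the config in this commit points both tokenizers at `parler-tts/parler_tts_mini_v0.1`, and the label tensors are random stand-ins for DAC audio codes:

```python
# Sketch only: run DataCollatorParlerTTSWithPadding (defined above) on dummy data.
import torch
from transformers import AutoTokenizer

# The script loads the prompt tokenizer with padding_side="left"; mirror that here.
prompt_tok = AutoTokenizer.from_pretrained("google/flan-t5-small", padding_side="left")
desc_tok = AutoTokenizer.from_pretrained("google/flan-t5-small")

collator = DataCollatorParlerTTSWithPadding(prompt_tokenizer=prompt_tok, description_tokenizer=desc_tok)

features = [
    {
        # labels are stored as (num_codebooks, seq_len) audio codes; random here
        "labels": torch.randint(0, 1024, (9, 20)).tolist(),
        "input_ids": desc_tok("a female speaker with a calm voice")["input_ids"],
        "prompt_input_ids": prompt_tok("Hello there!")["input_ids"],
    },
    {
        "labels": torch.randint(0, 1024, (9, 35)).tolist(),
        "input_ids": desc_tok("a male speaker, very expressive")["input_ids"],
        "prompt_input_ids": prompt_tok("How are you today?")["input_ids"],
    },
]

batch = collator(features)
print(batch["labels"].shape)            # (2, 35, 9): right-padded with -100
print(batch["prompt_input_ids"].shape)  # prompts are left-padded
```
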
+ def convert_dataset_str_to_list(
+     dataset_names,
+     dataset_config_names,
+     metadata_dataset_names=None,
+     splits=None,
+     dataset_samples=None,
+     default_split="train",
+ ):
+     if isinstance(dataset_names, str):
+         dataset_names = dataset_names.split("+")
+         dataset_config_names = dataset_config_names.split("+")
+         splits = splits.split("+") if splits is not None else None
+         dataset_samples = dataset_samples.split("+") if dataset_samples is not None else None
+         metadata_dataset_names = metadata_dataset_names.split("+") if metadata_dataset_names is not None else None
+
+     # basic checks to ensure we've got the right number of datasets/configs/splits/columns/probs
+     if len(dataset_names) != len(dataset_config_names):
+         raise ValueError(
+             f"Ensure one config is passed for each dataset, got {len(dataset_names)} datasets and"
+             f" {len(dataset_config_names)} configs."
+         )
+
+     if splits is not None and len(splits) != len(dataset_names):
+         raise ValueError(
+             f"Ensure one split is passed for each dataset, got {len(dataset_names)} datasets and {len(splits)} splits."
+         )
+
+     if metadata_dataset_names is not None and len(metadata_dataset_names) != len(dataset_names):
+         raise ValueError(
+             f"Ensure one metadata dataset is passed for each dataset, got {len(dataset_names)} datasets and {len(metadata_dataset_names)} metadata datasets."
+         )
+
+     if dataset_samples is not None:
+         if len(dataset_samples) != len(dataset_names):
+             raise ValueError(
+                 f"Ensure one sample is passed for each dataset, got {len(dataset_names)} datasets and "
+                 f"{len(dataset_samples)} samples."
+             )
+         dataset_samples = [float(ds_sample) for ds_sample in dataset_samples]
+     else:
+         dataset_samples = [None] * len(dataset_names)
+
+     # guard against indexing below when no metadata datasets are given
+     metadata_dataset_names = (
+         metadata_dataset_names if metadata_dataset_names is not None else [None] * len(dataset_names)
+     )
+
+     splits = splits if splits is not None else [default_split for _ in range(len(dataset_names))]
+
+     dataset_names_dict = []
+     for i, ds_name in enumerate(dataset_names):
+         dataset_names_dict.append(
+             {
+                 "name": ds_name,
+                 "config": dataset_config_names[i],
+                 "split": splits[i],
+                 "metadata_dataset_name": metadata_dataset_names[i],
+                 "samples": dataset_samples[i],
+             }
+         )
+     return dataset_names_dict
+
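
The '+' convention from the argument docstrings, made concrete with names taken from this commit's config (illustrative call, truncated to two datasets):

```python
# One entry per dataset, aligned by position across names/configs/splits/metadata.
dataset_dicts = convert_dataset_str_to_list(
    dataset_names="sanchit-gandhi/expresso-concatenated-half-normal+reach-vb/jenny_tts_dataset",
    dataset_config_names="read+default",
    metadata_dataset_names="sanchit-gandhi/expresso-concatenated-half-normal-tags-mistral+ylacombe/jenny-tts-10k-tagged",
    splits="train+train[:20%]",
)
# -> [{"name": "sanchit-gandhi/expresso-concatenated-half-normal", "config": "read",
#      "split": "train", "metadata_dataset_name": "sanchit-gandhi/expresso-concatenated-half-normal-tags-mistral",
#      "samples": None},
#     {"name": "reach-vb/jenny_tts_dataset", "config": "default", "split": "train[:20%]",
#      "metadata_dataset_name": "ylacombe/jenny-tts-10k-tagged", "samples": None}]
```
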
+ def load_multiple_datasets(
+     accelerator: Accelerator,
+     dataset_names: Union[List, str],
+     dataset_config_names: Union[List, str],
+     metadata_dataset_names: Optional[str] = None,
+     splits: Optional[Union[List, str]] = None,
+     label_column_names: Optional[List] = None,
+     stopping_strategy: Optional[str] = "first_exhausted",
+     dataset_samples: Optional[Union[List, np.array]] = None,
+     streaming: Optional[bool] = False,
+     seed: Optional[int] = None,
+     id_column_name: Optional[str] = None,
+     columns_to_keep: Optional[Set[str]] = None,
+     prompt_column_name: Optional[str] = None,
+     sampling_rate: Optional[int] = None,
+     audio_column_name: Optional[str] = None,
+     **kwargs,
+ ) -> Union[Dataset, IterableDataset]:
+     # pass arguments by keyword so they land on the right parameters of the helper
+     dataset_names_dict = convert_dataset_str_to_list(
+         dataset_names,
+         dataset_config_names,
+         metadata_dataset_names=metadata_dataset_names,
+         splits=splits,
+         dataset_samples=dataset_samples,
+     )
+
+     if dataset_samples is not None:
+         dataset_samples = [ds_dict["samples"] for ds_dict in dataset_names_dict]
+         probabilities = np.array(dataset_samples) / np.sum(dataset_samples)
+     else:
+         probabilities = None
+
+     all_datasets = []
+     # iterate over the datasets we want to interleave
+     for dataset_dict in tqdm(dataset_names_dict, desc="Combining datasets..."):
+         with accelerator.main_process_first():
+             dataset = load_dataset(
+                 dataset_dict["name"],
+                 dataset_dict["config"],
+                 split=dataset_dict["split"],
+                 streaming=streaming,
+                 **kwargs,
+             )
+             dataset_features = dataset.features.keys()
+
+             if sampling_rate is not None and audio_column_name is not None:
+                 # resample target audio
+                 dataset = dataset.cast_column(audio_column_name, datasets.features.Audio(sampling_rate=sampling_rate))
+
+             metadata_dataset_name = dataset_dict["metadata_dataset_name"]
+             if metadata_dataset_name is not None:
+                 logger.info(
+                     f'Merging {dataset_dict["name"]} - {dataset_dict["split"]} with {metadata_dataset_name} - {dataset_dict["split"]}'
+                 )
+                 metadata_dataset = load_dataset(
+                     metadata_dataset_name,
+                     dataset_dict["config"],
+                     split=dataset_dict["split"],
+                     streaming=streaming,
+                     **kwargs,
+                 )
+
+                 # TODO(YL): I forgot to create unique ids for MLS english.
+                 # To iterate faster, I bypass the original id check and do another one. - Done once because assuming it won't change next time
+                 # if dataset_dict["name"] == "parler-tts/mls_eng_10k":
+                 #     def concat_ids(book_id, speaker_id, begin_time):
+                 #         return {"id": f"{book_id}_{speaker_id}_{str(begin_time).replace('.', '_')}"}
+                 #     dataset = dataset.map(concat_ids, input_columns=["book_id", "speaker_id", "begin_time"], num_proc=24)
+                 #     metadata_dataset = metadata_dataset.map(concat_ids, input_columns=["book_id", "speaker_id", "begin_time"], num_proc=24)
+                 #     metadata_dataset = metadata_dataset.rename_column(id_column_name, f"metadata_{id_column_name}")
+
+                 if dataset_dict["name"] != "parler-tts/mls_eng_10k":
+                     if id_column_name is not None and id_column_name not in dataset.column_names:
+                         raise ValueError(
+                             f"id_column_name={id_column_name} but has not been found in the dataset columns "
+                             f"- one of {', '.join(list(dataset.column_names))}."
+                         )
+                     if id_column_name is not None and id_column_name not in metadata_dataset.column_names:
+                         raise ValueError(
+                             f"id_column_name={id_column_name} but has not been found in the metadata dataset columns "
+                             f"- one of {', '.join(list(metadata_dataset.column_names))}."
+                         )
+                     elif id_column_name is not None:
+                         metadata_dataset = metadata_dataset.rename_column(id_column_name, f"metadata_{id_column_name}")
+
+                 metadata_columns_to_remove = set(metadata_dataset.column_names).intersection(set(dataset.column_names))
+
+                 if prompt_column_name is not None:
+                     # We might have applied some transformations to the prompts (e.g. punctuation restoration)
+                     # so we make sure to remove it from the original dataset
+                     if prompt_column_name in dataset.column_names:
+                         logger.info(
+                             f"REMOVE {prompt_column_name} from dataset {dataset_dict['name']} - {dataset_dict['split']}"
+                         )
+                         # remove_columns returns a new dataset; the result must be re-assigned
+                         dataset = dataset.remove_columns(prompt_column_name)
+
+                 metadata_columns_to_remove = set(metadata_dataset.column_names).intersection(set(dataset.column_names))
+                 metadata_dataset = metadata_dataset.remove_columns(metadata_columns_to_remove)
+
+                 dataset = concatenate_datasets([dataset, metadata_dataset], axis=1)
+
+                 if id_column_name is not None and dataset_dict["name"] != "parler-tts/mls_eng_10k":
+                     if (
+                         len(
+                             dataset.filter(
+                                 lambda id1, id2: id1 != id2,
+                                 input_columns=[id_column_name, f"metadata_{id_column_name}"],
+                             )
+                         )
+                         != 0
+                     ):
+                         raise ValueError(
+                             f"Concatenate didn't work. Some ids don't correspond on dataset {dataset_dict['name']}"
+                         )
+
+                 dataset_features = dataset.features.keys()
+
+             if columns_to_keep is not None:
+                 dataset = dataset.remove_columns(set(dataset_features - columns_to_keep))
+         all_datasets.append(dataset)
+
+     if len(all_datasets) == 1:
+         # we have a single dataset so just return it as is
+         return all_datasets[0]
+
+     if streaming:
+         interleaved_dataset = interleave_datasets(
+             all_datasets,
+             stopping_strategy=stopping_strategy,
+             probabilities=probabilities,
+             seed=seed,
+         )
+     else:
+         with accelerator.main_process_first():
+             interleaved_dataset = concatenate_datasets(all_datasets)
+
+     return interleaved_dataset
+
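
As a side note, this is how `dataset_samples` becomes interleaving probabilities in the streaming branch above (illustrative numbers; the config in this commit does not set `train_dataset_samples`):

```python
# Per-dataset sample counts are normalised into sampling probabilities
# for datasets.interleave_datasets.
import numpy as np

dataset_samples = [10_000, 2_000, 8_000]
probabilities = np.array(dataset_samples) / np.sum(dataset_samples)
print(probabilities)  # [0.5 0.1 0.4]
```
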
778
+ def main():
779
+ # See all possible arguments in src/transformers/training_args.py
780
+ # or by passing the --help flag to this script.
781
+ # We now keep distinct sets of args, for a cleaner separation of concerns.
782
+
783
+ parser = HfArgumentParser((ModelArguments, DataTrainingArguments, ParlerTTSTrainingArguments))
784
+ if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
785
+ # If we pass only one argument to the script and it's the path to a json file,
786
+ # let's parse it to get our arguments.
787
+ model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
788
+ else:
789
+ model_args, data_args, training_args = parser.parse_args_into_dataclasses()
790
+
791
+ # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The
792
+ # information sent is the one passed as arguments along with your Python/PyTorch versions.
793
+ send_example_telemetry("run_parler_tts", model_args, data_args)
794
+
795
+ if training_args.dtype == "float16":
796
+ mixed_precision = "fp16"
797
+ elif training_args.dtype == "bfloat16":
798
+ mixed_precision = "bf16"
799
+ else:
800
+ mixed_precision = "no"
801
+
802
+ if data_args.pad_to_max_length and (
803
+ data_args.max_duration_in_seconds is None
804
+ or data_args.max_prompt_token_length is None
805
+ or data_args.max_description_token_length is None
806
+ ):
807
+ raise ValueError(
808
+ "`pad_to_max_length` is `True` but one of the following parameters has not been set: `max_duration_in_seconds`, `max_prompt_token_length`, `max_description_token_length`"
809
+ )
810
+
811
+ padding = "max_length" if data_args.pad_to_max_length else "longest"
812
+
813
+ ####### A. Preparation
814
+ kwargs_handlers = [InitProcessGroupKwargs(timeout=timedelta(minutes=60))]
815
+ if training_args.torch_compile:
816
+ # TODO(YL): add more compile modes?
817
+ kwargs_handlers.append(TorchDynamoPlugin(backend="inductor", mode="default")) # reduce-overhead
818
+
819
+ accelerator = Accelerator(
820
+ gradient_accumulation_steps=training_args.gradient_accumulation_steps,
821
+ mixed_precision=mixed_precision,
822
+ log_with=training_args.report_to,
823
+ project_dir=training_args.output_dir,
824
+ kwargs_handlers=kwargs_handlers,
825
+ )
826
+
827
+ accelerator.init_trackers(
828
+ project_name=data_args.wandb_project,
829
+ config={
830
+ "learning_rate": training_args.learning_rate,
831
+ "model_name_or_path": model_args.model_name_or_path,
832
+ "num_train_epochs": training_args.num_train_epochs,
833
+ "gradient_accumulation_steps": training_args.gradient_accumulation_steps,
834
+ "per_device_train_batch_size": training_args.per_device_train_batch_size,
835
+ "global_batch_size": training_args.per_device_train_batch_size * accelerator.num_processes,
836
+ "mixed_precision": mixed_precision,
837
+ "lr_scheduler_type": training_args.lr_scheduler_type,
838
+ "warmup_steps": training_args.warmup_steps,
839
+ "freeze_text_encoder": model_args.freeze_text_encoder,
840
+ "max_duration_in_seconds": data_args.max_duration_in_seconds,
841
+ "weight_decay": training_args.weight_decay,
842
+ "adam_beta1": training_args.adam_beta1,
843
+ "adam_beta2": training_args.adam_beta2,
844
+ "temperature": model_args.temperature,
845
+ },
846
+ )
847
+
848
+ # Detecting last checkpoint and eventually continue from last checkpoint
849
+ last_checkpoint = None
850
+ if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
851
+ last_checkpoint = get_last_checkpoint(training_args.output_dir)
852
+ if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
853
+ raise ValueError(
854
+ f"Output directory ({training_args.output_dir}) already exists and is not empty. "
855
+ "Use --overwrite_output_dir to overcome."
856
+ )
857
+ elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
858
+ logger.info(
859
+ f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
860
+ "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
861
+ )
862
+
863
+ # Setup logging
864
+ logging.basicConfig(
865
+ format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
866
+ datefmt="%m/%d/%Y %H:%M:%S",
867
+ handlers=[logging.StreamHandler(sys.stdout)],
868
+ )
869
+ logger.setLevel(logging.INFO if accelerator.is_main_process else logging.WARN)
870
+
871
+ # Log a small summary on each proces
872
+ logger.warning(
873
+ f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
874
+ f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}"
875
+ )
876
+
877
+ # Set the verbosity to info of the Transformers logger (on main process only)
878
+ if accelerator.is_local_main_process:
879
+ datasets.utils.logging.set_verbosity_warning()
880
+ transformers.utils.logging.set_verbosity_info()
881
+ else:
882
+ datasets.utils.logging.set_verbosity_error()
883
+ transformers.utils.logging.set_verbosity_error()
884
+
885
+ logger.info("Training/evaluation parameters %s", training_args)
886
+
887
+ # Set seed before initializing model.
888
+ set_seed(training_args.seed)
889
+ num_workers = data_args.preprocessing_num_workers
890
+
891
+ # 1. First, lett's instantiate the feature extractor, tokenizers and model
892
+ # Note for distributed training, the .from_pretrained methods guarantee that only
893
+ # one local process can concurrently download model & vocab.
894
+
895
+ # load feature extractor
896
+ feature_extractor = AutoFeatureExtractor.from_pretrained(
897
+ model_args.feature_extractor_name or model_args.model_name_or_path,
898
+ cache_dir=model_args.cache_dir,
899
+ token=data_args.token,
900
+ trust_remote_code=data_args.trust_remote_code,
901
+ )
902
+ sampling_rate = feature_extractor.sampling_rate
903
+
904
+ # load prompt tokenizer
905
+ prompt_tokenizer = AutoTokenizer.from_pretrained(
906
+ model_args.prompt_tokenizer_name or model_args.description_tokenizer_name or model_args.model_name_or_path,
907
+ cache_dir=model_args.cache_dir,
908
+ token=data_args.token,
909
+ trust_remote_code=data_args.trust_remote_code,
910
+ use_fast=model_args.use_fast_tokenizer,
911
+ padding_side="left", # prompt has to be padded on the left bc it's preprend to codebooks hidden states
912
+ )
913
+
914
+ # load description tokenizer
915
+ description_tokenizer = AutoTokenizer.from_pretrained(
916
+ model_args.description_tokenizer_name or model_args.model_name_or_path,
917
+ cache_dir=model_args.cache_dir,
918
+ token=data_args.token,
919
+ trust_remote_code=data_args.trust_remote_code,
920
+ use_fast=model_args.use_fast_tokenizer,
921
+ )
922
+
923
+ if model_args.use_fast_tokenizer:
924
+ logger.warning(
925
+ "Disabling fast tokenizer warning: https://github.com/huggingface/transformers/blob/main/src/transformers/tokenization_utils_base.py#L3231-L3235"
926
+ )
927
+ prompt_tokenizer.deprecation_warnings["Asking-to-pad-a-fast-tokenizer"] = True
928
+ description_tokenizer.deprecation_warnings["Asking-to-pad-a-fast-tokenizer"] = True
929
+
930
+ # 2. Now, let's load the dataset
931
+
932
+ if data_args.save_to_disk is not None:
933
+ os.makedirs(data_args.save_to_disk, exist_ok=True)
934
+
935
+ # assume that the dataset has been saved to `save_to_disk` if the latter is not empty
936
+ dataset_was_precomputed = len(os.listdir(data_args.save_to_disk)) > 0
937
+ if dataset_was_precomputed:
938
+ vectorized_datasets = datasets.load_from_disk(data_args.save_to_disk)
939
+ else:
940
+ raw_datasets = DatasetDict()
941
+
942
+ columns_to_keep = {
943
+ "target_audio_column_name": data_args.target_audio_column_name,
944
+ "prompt_column_name": data_args.prompt_column_name,
945
+ }
946
+ if data_args.description_column_name is not None:
947
+ columns_to_keep["description_column_name"] = data_args.description_column_name
948
+
949
+ if training_args.do_train:
950
+ raw_datasets["train"] = load_multiple_datasets(
951
+ accelerator,
952
+ data_args.train_dataset_name,
953
+ data_args.train_dataset_config_name,
954
+ metadata_dataset_names=data_args.train_metadata_dataset_name,
955
+ splits=data_args.train_split_name,
956
+ dataset_samples=data_args.train_dataset_samples,
957
+ seed=training_args.seed,
958
+ cache_dir=model_args.cache_dir,
959
+ num_proc=data_args.preprocessing_num_workers,
960
+ id_column_name=data_args.id_column_name,
961
+ columns_to_keep=columns_to_keep.values(),
962
+ prompt_column_name=data_args.prompt_column_name,
963
+ audio_column_name=data_args.target_audio_column_name,
964
+ sampling_rate=sampling_rate,
965
+ # streaming=data_args.streaming, TODO(SG): optionally enable streaming mode
966
+ )
967
+
968
+ for key in columns_to_keep:
969
+ if columns_to_keep[key] not in raw_datasets["train"].column_names:
970
+ raise ValueError(
971
+ f"--{key} '{columns_to_keep[key]}' not found in dataset '{data_args.train_dataset_name}'."
972
+ f" Make sure to set `--{key}` to the correct audio column - one of"
973
+ f" {', '.join(raw_datasets['train'].column_names)}."
974
+ )
975
+
976
+ if data_args.max_train_samples is not None:
977
+ raw_datasets["train"] = raw_datasets["train"].select(range(data_args.max_train_samples))
978
+
979
+ if training_args.do_eval:
980
+ raw_datasets["eval"] = load_multiple_datasets(
981
+ accelerator,
982
+ data_args.eval_dataset_name if data_args.eval_dataset_name else data_args.train_dataset_name,
983
+ data_args.eval_dataset_config_name
984
+ if data_args.eval_dataset_config_name
985
+ else data_args.train_dataset_config_name,
986
+ metadata_dataset_names=data_args.eval_metadata_dataset_name,
987
+ splits=data_args.eval_split_name,
988
+ cache_dir=model_args.cache_dir,
989
+ num_proc=data_args.preprocessing_num_workers,
990
+ id_column_name=data_args.id_column_name,
991
+ columns_to_keep=columns_to_keep.values(),
992
+ prompt_column_name=data_args.prompt_column_name,
993
+ audio_column_name=data_args.target_audio_column_name,
994
+ sampling_rate=sampling_rate,
995
+ # streaming=data_args.streaming, TODO(SG): optionally enable streaming mode
996
+ )
997
+
998
+ if data_args.max_eval_samples is not None:
999
+ raw_datasets["eval"] = (
1000
+ raw_datasets["eval"].shuffle(seed=training_args.seed).select(range(data_args.max_eval_samples))
1001
+ )
1002
+
1003
+ # 3. Next, let's load the config.
1004
+ config = ParlerTTSConfig.from_pretrained(
1005
+ model_args.model_name_or_path,
1006
+ cache_dir=model_args.cache_dir,
1007
+ token=data_args.token,
1008
+ trust_remote_code=data_args.trust_remote_code,
1009
+ )
1010
+
1011
+ # update pad token id and decoder_start_token_id
1012
+ config.update(
1013
+ {
1014
+ "pad_token_id": model_args.pad_token_id if model_args.pad_token_id is not None else config.pad_token_id,
1015
+ "decoder_start_token_id": (
1016
+ model_args.decoder_start_token_id
1017
+ if model_args.decoder_start_token_id is not None
1018
+ else config.decoder_start_token_id
1019
+ ),
1020
+ }
1021
+ )
1022
+
1023
+ # create model
1024
+ model = ParlerTTSForConditionalGeneration.from_pretrained(
1025
+ model_args.model_name_or_path,
1026
+ cache_dir=model_args.cache_dir,
1027
+ config=config,
1028
+ token=data_args.token,
1029
+ trust_remote_code=data_args.trust_remote_code,
1030
+ )
1031
+
1032
+ # enable gradient checkpointing if necessary
1033
+ if training_args.gradient_checkpointing:
1034
+ model.gradient_checkpointing_enable()
1035
+
1036
+ # 4. Now we preprocess the datasets including loading the audio, resampling and normalization
1037
+ # Thankfully, `datasets` takes care of automatically loading and resampling the audio,
1038
+ # so that we just need to set the correct target sampling rate and normalize the input
1039
+ # via the `feature_extractor`
1040
+
1041
+ # derive max & min input length for sample rate & max duration
1042
+ sampling_rate = feature_extractor.sampling_rate
1043
+ max_target_length = data_args.max_duration_in_seconds * sampling_rate
1044
+ min_target_length = data_args.min_duration_in_seconds * sampling_rate
1045
+ target_audio_column_name = data_args.target_audio_column_name
1046
+ description_column_name = data_args.description_column_name
1047
+ prompt_column_name = data_args.prompt_column_name
1048
+ feature_extractor_input_name = feature_extractor.model_input_names[0]
1049
+ audio_encoder_pad_token_id = config.decoder.pad_token_id
1050
+ audio_encoder_eos_token_id = config.decoder.eos_token_id
1051
+ audio_encoder_bos_token_id = model.generation_config.decoder_start_token_id
1052
+ max_length = model.generation_config.max_length
1053
+ num_codebooks = model.decoder.config.num_codebooks
1054
+ bandwidth = model_args.bandwidth
1055
+
1056
+ # Freeze Encoders
1057
+ model.freeze_encoders(model_args.freeze_text_encoder)
1058
+
1059
+ # Test all gather - used for warmup and avoiding timeout
1060
+ test_tensor = torch.tensor([accelerator.process_index], device=accelerator.device)
1061
+ gathered_tensor = accelerator.gather(test_tensor)
1062
+ print("gathered_tensor", gathered_tensor)
1063
+ accelerator.wait_for_everyone()
1064
+
1065
+ if not dataset_was_precomputed:
1066
+ # Filter on text length
1067
+ if description_column_name is not None and data_args.max_text_length is not None:
1068
+ with accelerator.main_process_first():
1069
+ # keep only descriptions shorter than max_text_length
1070
+ raw_datasets = raw_datasets.filter(
1071
+ lambda x: len(x) < data_args.max_text_length,
1072
+ num_proc=num_workers,
1073
+ input_columns=[description_column_name],
1074
+ )
1075
+
1076
+ # Preprocessing the dataset.
1077
+ # We need to tokenize the texts.
1078
+ def pass_through_processors(description, prompt):
1079
+ batch = {}
1080
+
1081
+ batch["input_ids"] = description_tokenizer(description.strip())["input_ids"]
1082
+ batch["prompt_input_ids"] = prompt_tokenizer(prompt.strip())["input_ids"]
1083
+
1084
+ return batch
1085
+
1086
+ with accelerator.main_process_first():
1087
+ # this is a trick to avoid rewriting the entire audio column, which takes ages
1088
+ vectorized_datasets = raw_datasets.map(
1089
+ pass_through_processors,
1090
+ remove_columns=next(iter(raw_datasets.values())).column_names,
1091
+ input_columns=[description_column_name, prompt_column_name],
1092
+ num_proc=num_workers,
1093
+ desc="preprocess datasets",
1094
+ )
1095
+
1096
+ # We use Accelerate to perform distributed inference
1097
+ # T5 doesn't support fp16
1098
+ autocast_kwargs = AutocastKwargs(enabled=(mixed_precision != "fp16"))
1099
+
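+ # Note (editorial sketch): when mixed_precision == "fp16" this disables autocast around
+ # the T5 text-encoder forward passes below, since T5-style encoders are known to
+ # overflow in float16; bf16 and fp32 runs keep autocast enabled.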
1100
+ # Now we encode the audio labels with encodec.
1101
+ ####### B. Encode audio
1102
+
1103
+ logger.info("*** Encode target audio with encodec ***")
1104
+
1105
+ # no need to prepare audio_decoder because it is only used for inference, without mixed precision
1106
+ # see: https://huggingface.co/docs/accelerate/main/en/package_reference/accelerator#accelerate.Accelerator.prepare
1107
+ if training_args.torch_compile:
1108
+ audio_decoder = accelerator.prepare_model(model.audio_encoder, evaluation_mode=True)
1109
+ else:
1110
+ audio_decoder = model.audio_encoder
1111
+
1112
+ encoder_data_collator = DataCollatorEncodecWithPadding(
1113
+ feature_extractor,
1114
+ audio_column_name=target_audio_column_name,
1115
+ feature_extractor_input_name=feature_extractor_input_name,
1116
+ max_length=max_target_length,
1117
+ padding=padding,
1118
+ )
1119
+
1120
+ def apply_audio_decoder(batch):
1121
+ len_audio = batch.pop("len_audio")
1122
+ audio_decoder.to(batch["input_values"].device).eval()
1123
+ with torch.no_grad():
1124
+ labels = audio_decoder.encode(**batch, bandwidth=bandwidth)["audio_codes"]
1125
+ output = {}
1126
+ output["len_audio"] = len_audio
1127
+ # (1, bsz, codebooks, seq_len) -> (bsz, seq_len, codebooks)
1128
+ output["labels"] = labels.squeeze(0).transpose(1, 2)
1129
+ output["ratio"] = torch.ones_like(len_audio) * labels.shape[-1] / len_audio.max()
1130
+ return output
1131
+
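+ # Shape sketch (following the comment above): encodec returns audio_codes of shape
+ # (1, bsz, num_codebooks, seq_len); squeeze(0).transpose(1, 2) yields labels of shape
+ # (bsz, seq_len, num_codebooks), and "ratio" maps each raw audio length to its
+ # equivalent length in codec frames so the labels can be truncated per sample below.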
1132
+ for split in vectorized_datasets:
1133
+ data_loader = DataLoader(
1134
+ raw_datasets[split],
1135
+ batch_size=training_args.audio_encoder_per_device_batch_size,
1136
+ collate_fn=encoder_data_collator,
1137
+ num_workers=training_args.dataloader_num_workers,
1138
+ pin_memory=True,
1139
+ )
1140
+ data_loader = accelerator.prepare(data_loader)
1141
+
1142
+ all_generated_labels = []
1143
+ all_lens = []
1144
+ for batch in tqdm(data_loader, disable=not accelerator.is_local_main_process):
1145
+ generate_labels = apply_audio_decoder(batch)
1146
+ generate_labels = accelerator.pad_across_processes(generate_labels, dim=1, pad_index=0)
1147
+ generate_labels = accelerator.gather_for_metrics(generate_labels)
1148
+
1149
+ if accelerator.is_main_process:
1150
+ lab = generate_labels["labels"].cpu().transpose(1, 2).to(torch.int16)
1151
+ rat = generate_labels["ratio"].cpu().squeeze()
1152
+ lens = generate_labels["len_audio"].cpu().squeeze()
1153
+ lab = [l[:, : int(ratio * length)] for (l, ratio, length) in zip(lab, rat, lens)]
1154
+
1155
+ all_generated_labels.extend(lab)
1156
+ all_lens.extend(lens)
1157
+
1158
+ # (1, codebooks, seq_len) where seq_len=1
1159
+ bos_labels = torch.ones((1, num_codebooks, 1)) * audio_encoder_bos_token_id
1160
+
1161
+ if accelerator.is_main_process:
1162
+ tmp_labels = Dataset.from_dict({"labels": all_generated_labels, "target_length": all_lens})
1163
+ tmp_labels.save_to_disk(
1164
+ os.path.join(data_args.temporary_save_to_disk, split),
1165
+ num_proc=1 if split == "eval" else data_args.preprocessing_num_workers,
1166
+ )
1167
+ accelerator.wait_for_everyone()
1168
+ del all_generated_labels
1169
+
1170
+ tmp_labels = datasets.load_from_disk(os.path.join(data_args.temporary_save_to_disk, split))
1171
+ with accelerator.main_process_first():
1172
+ vectorized_datasets[split] = concatenate_datasets([vectorized_datasets[split], tmp_labels], axis=1)
1173
+
1174
+ def postprocess_dataset(labels):
1175
+ # (1, codebooks, seq_len)
1176
+ labels = torch.tensor(labels).unsqueeze(0)
1177
+ # add bos
1178
+ labels = torch.cat([bos_labels, labels], dim=-1)
1179
+
1180
+ labels, delay_pattern_mask = build_delay_pattern_mask(
1181
+ labels,
1182
+ bos_token_id=audio_encoder_bos_token_id,
1183
+ pad_token_id=audio_encoder_eos_token_id,
1184
+ max_length=labels.shape[-1] + num_codebooks,
1185
+ num_codebooks=num_codebooks,
1186
+ )
1187
+
1188
+ # the first ids of the delay pattern mask are precisely the labels; we use the rest of the mask
1189
+ # to take care of EOS
1190
+ # we want labels to look like this:
1191
+ # - [B, a, b, E, E, E, E]
1192
+ # - [B, B, c, d, E, E, E]
1193
+ # - [B, B, B, e, f, E, E]
1194
+ # - [B, B, B, B, g, h, E]
1195
+ labels = torch.where(delay_pattern_mask == -1, audio_encoder_eos_token_id, delay_pattern_mask)
1196
+
1197
+ # the first timestamp is associated with a row full of BOS, let's get rid of it
1198
+ # we also remove the last timestamps (full of PAD)
1199
+ output = {"labels": labels[:, 1:]}
1200
+ return output
1201
+
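+ # Worked example (illustrative): with num_codebooks=2 and per-codebook codes
+ # [a, b] / [c, d], prepending BOS and applying the delay pattern gives
+ # [B, a, b, E, E] and [B, B, c, d, E]; labels[:, 1:] then drops the leading
+ # all-BOS timestep, as described in the comments above.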
1202
+ with accelerator.main_process_first():
1203
+ vectorized_datasets[split] = vectorized_datasets[split].map(
1204
+ postprocess_dataset,
1205
+ num_proc=data_args.preprocessing_num_workers,  # this one is resource-consuming with many processes
1206
+ input_columns=["labels"],
1207
+ desc="Postprocessing labeling",
1208
+ )
1209
+
1210
+ accelerator.free_memory()
1211
+ del generate_labels, all_lens
1212
+
1213
+ with accelerator.main_process_first():
1214
+ # NOTE: filtering is done at the end because in the `datasets` library, caching audio files is done after most operations
1215
+ # caching audio files is time- and disk-space-consuming, so we want to avoid it at all costs, especially for large (>1k hours) audio datasets.
1216
+ # That's also why we avoid concatenating the processed datasets (vectorized_datasets) with the audio column present in raw_datasets.
1217
+
1218
+ def is_audio_in_length_range(length):
1219
+ return min_target_length < length < max_target_length
1220
+
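+ # Lengths here are in audio samples, not seconds: e.g. with the 44.1 kHz feature
+ # extractor used in this run, max_duration_in_seconds=30 gives
+ # max_target_length = 30 * 44100 = 1,323,000 samples (illustrative numbers).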
1221
+ # keep only audio whose length lies strictly between min_target_length and max_target_length
1222
+ vectorized_datasets = vectorized_datasets.filter(
1223
+ is_audio_in_length_range,
1224
+ num_proc=num_workers,
1225
+ input_columns=["target_length"],
1226
+ )
1227
+
1228
+ if description_column_name is not None and data_args.max_description_token_length is not None:
1229
+ with accelerator.main_process_first():
1230
+ # keep only descriptions shorter than max_description_token_length tokens
1231
+ vectorized_datasets = vectorized_datasets.filter(
1232
+ lambda x: len(x) < data_args.max_description_token_length,
1233
+ num_proc=num_workers,
1234
+ input_columns=["input_ids"],
1235
+ )
1236
+
1237
+ if data_args.max_prompt_token_length is not None:
1238
+ with accelerator.main_process_first():
1239
+ # keep only prompts shorter than max_prompt_token_length tokens
1240
+ vectorized_datasets = vectorized_datasets.filter(
1241
+ lambda x: len(x) < data_args.max_prompt_token_length,
1242
+ num_proc=num_workers,
1243
+ input_columns=["prompt_input_ids"],
1244
+ )
1245
+
1246
+ if data_args.save_to_disk is not None and not dataset_was_precomputed:
1247
+ if accelerator.is_main_process:
1248
+ vectorized_datasets.save_to_disk(
1249
+ data_args.save_to_disk,
1250
+ num_proc=min(data_args.preprocessing_num_workers, len(vectorized_datasets["eval"]) - 1),
1251
+ )
1252
+ logger.info(f"Dataset saved at {data_args.save_to_disk}")
1253
+
1254
+ audio_max_length = None
1255
+ if training_args.torch_compile:
1256
+ audio_max_length = max(vectorized_datasets["train"]["target_length"])
1257
+ with accelerator.main_process_first():
1258
+ max_sample = vectorized_datasets["train"].filter(
1259
+ lambda x: x == audio_max_length,
1260
+ num_proc=num_workers,
1261
+ input_columns=["target_length"],
1262
+ )
1263
+ audio_max_length = torch.tensor(max_sample[0]["labels"]).shape[1]
1264
+
1265
+ # for large datasets it is advised to run the preprocessing on a
1266
+ # single machine first with ``args.preprocessing_only`` since there will most likely
1267
+ # be a timeout when running the script in distributed mode.
1268
+ # In a second step ``args.preprocessing_only`` can then be set to `False` to load the
1269
+ # cached dataset
1270
+ if data_args.preprocessing_only and data_args.save_to_disk is None:
1271
+ raise ValueError(
1272
+ "`preprocessing_only=True` but `save_to_disk` is not set. The latter should indicates where to save the dataset locally."
1273
+ )
1274
+ elif data_args.preprocessing_only:
1275
+ logger.info(f"Data preprocessing finished. Files save at {data_args.save_to_disk}")
1276
+ return
1277
+
1278
+ # 6. Next, we can prepare the training.
1279
+
1280
+ # Let's use CLAP similarity and word error rate (WER) as our evaluation metrics.
1281
+
1282
+ # Define evaluation metrics during training, *i.e.* CLAP similarity and WER
1283
+ clap = AutoModel.from_pretrained(model_args.clap_model_name_or_path)
1284
+ clap_processor = AutoProcessor.from_pretrained(model_args.clap_model_name_or_path)
1285
+ metric = evaluate.load("wer")
1286
+
1287
+ def clap_similarity(texts, audios, device):
1288
+ clap_inputs = clap_processor(text=texts, audios=audios, padding=True, return_tensors="pt").to(device)
1289
+ clap.to(device)
1290
+ with torch.no_grad():
1291
+ text_features = clap.get_text_features(
1292
+ clap_inputs["input_ids"], attention_mask=clap_inputs.get("attention_mask", None)
1293
+ )
1294
+ audio_features = clap.get_audio_features(clap_inputs["input_features"])
1295
+
1296
+ cosine_sim = torch.nn.functional.cosine_similarity(audio_features, text_features, dim=1, eps=1e-8)
1297
+
1298
+ clap.to("cpu")
1299
+ clap_inputs.to("cpu")
1300
+ return cosine_sim.mean().to("cpu")
1301
+
1302
+ def wer(prompts, audios, device):
1303
+ asr_pipeline = pipeline(model=model_args.asr_model_name_or_path, device=device)
1304
+ transcriptions = asr_pipeline(
1305
+ [{"raw": audio, "sampling_rate": sampling_rate} for audio in audios],
1306
+ batch_size=int(training_args.per_device_eval_batch_size),
1307
+ )
1308
+
1309
+ word_error = 100 * metric.compute(
1310
+ predictions=[t["text"].lower() for t in transcriptions], references=[t.lower() for t in prompts]
1311
+ )
1312
+
1313
+ return word_error, [t["text"] for t in transcriptions]
1314
+
1315
+ eval_methods = {"clap": clap_similarity, "wer": wer}
1316
+
1317
+ def compute_metrics(audios, descriptions, prompts, device="cpu"):
1318
+ input_ids = descriptions
1319
+ texts = description_tokenizer.batch_decode(input_ids, skip_special_tokens=True)
1320
+ prompts = prompt_tokenizer.batch_decode(prompts, skip_special_tokens=True)
1321
+ audios = [a.cpu().numpy() for a in audios]
1322
+ results = {"clap": eval_methods["clap"](texts, audios, device)}
1323
+ word_error, transcriptions = eval_methods["wer"](prompts, audios, device)
1324
+ results["wer"] = word_error
1325
+
1326
+ return results, texts, prompts, audios, transcriptions
1327
+
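+ # Usage sketch: compute_metrics receives generated waveforms plus the *tokenized*
+ # descriptions and prompts; it decodes both back to text, then scores
+ # description/audio agreement with CLAP and prompt intelligibility with ASR-based WER.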
1328
+ # Define Training Schedule
1329
+ # Store some constants
1330
+ per_device_train_batch_size = int(training_args.per_device_train_batch_size)
1331
+ train_batch_size = per_device_train_batch_size * accelerator.num_processes
1332
+ gradient_accumulation_steps = int(training_args.gradient_accumulation_steps)
1333
+ per_device_eval_batch_size = int(training_args.per_device_eval_batch_size)
1334
+
1335
+ if training_args.max_steps < 0:
1336
+ num_epochs = int(training_args.num_train_epochs)
1337
+ steps_per_epoch = len(vectorized_datasets["train"]) // (train_batch_size * gradient_accumulation_steps)
1338
+ total_train_steps = steps_per_epoch * num_epochs
1339
+ elif training_args.max_steps > 0:
1340
+ logger.info("max_steps is given, it will override any value given in num_train_epochs")
1341
+ total_train_steps = int(training_args.max_steps)
1342
+ # Setting a very large number of epochs so we go as many times as necessary over the iterator.
1343
+ num_epochs = sys.maxsize
1344
+ steps_per_epoch = total_train_steps
1345
+
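+ # Arithmetic sketch (hypothetical numbers): with 10,000 train samples, 1 process,
+ # per_device_train_batch_size=16 and gradient_accumulation_steps=8,
+ # steps_per_epoch = 10000 // (16 * 8) = 78 optimizer steps per epoch.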
1346
+ if training_args.evaluation_strategy == "epoch":
1347
+ eval_steps = steps_per_epoch
1348
+ elif training_args.eval_steps is None:
1349
+ logger.info(f"eval_steps is not set, evaluating at the end of each epoch")
1350
+ eval_steps = steps_per_epoch
1351
+ else:
1352
+ eval_steps = training_args.eval_steps
1353
+
1354
+ if training_args.save_strategy == "epoch":
1355
+ save_steps = steps_per_epoch
1356
+ elif training_args.save_strategy == "steps":
1357
+ save_steps = training_args.save_steps
1358
+ else:
1359
+ save_steps = sys.maxsize
1360
+
1361
+ # T5 doesn't support fp16
1362
+ autocast_kwargs = AutocastKwargs(enabled=(mixed_precision != "fp16"))
1363
+
1364
+ # Define optimizer, LR scheduler, collator
1365
+ optimizer = torch.optim.AdamW(
1366
+ params=model.parameters(),
1367
+ lr=training_args.learning_rate,
1368
+ betas=(training_args.adam_beta1, training_args.adam_beta2),
1369
+ eps=training_args.adam_epsilon,
1370
+ weight_decay=training_args.weight_decay,
1371
+ )
1372
+
1373
+ # LR scheduler gets stepped by `num_processes` each time -> account for this in warmup / total steps
1374
+ lr_scheduler = get_scheduler(
1375
+ name=training_args.lr_scheduler_type,
1376
+ optimizer=optimizer,
1377
+ num_warmup_steps=training_args.get_warmup_steps(total_train_steps) * accelerator.num_processes,
1378
+ num_training_steps=total_train_steps * accelerator.num_processes,
1379
+ )
1380
+
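+ # Arithmetic sketch: with e.g. 4 processes and total_train_steps=1000, the scheduler
+ # is stepped 4000 times overall, so both the warmup and total step counts are
+ # multiplied by num_processes above to keep the effective schedule unchanged.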
1381
+ # Instantiate custom data collator
1382
+ data_collator = DataCollatorParlerTTSWithPadding(
1383
+ prompt_tokenizer=prompt_tokenizer,
1384
+ description_tokenizer=description_tokenizer,
1385
+ pad_to_multiple_of=data_args.pad_to_multiple_of,
1386
+ padding=padding,
1387
+ prompt_max_length=data_args.max_prompt_token_length,
1388
+ description_max_length=data_args.max_description_token_length,
1389
+ audio_max_length=audio_max_length,
1390
+ )
1391
+
1392
+ # Prepare everything with accelerate
1393
+ model, optimizer, lr_scheduler = accelerator.prepare(model, optimizer, lr_scheduler)
1394
+
1395
+ logger.info("***** Running training *****")
1396
+ logger.info(f" Num examples = {total_train_steps * train_batch_size * gradient_accumulation_steps}")
1397
+ logger.info(" Instantaneous batch size per device =" f" {per_device_train_batch_size}")
1398
+ logger.info(" Gradient accumulation steps =" f" {gradient_accumulation_steps}")
1399
+ logger.info(
1400
+ f" Total train batch size (w. parallel & distributed) = {train_batch_size * gradient_accumulation_steps}"
1401
+ )
1402
+ logger.info(f" Total optimization steps = {total_train_steps}")
1403
+
1404
+ # ======================== Training ================================
1405
+ train_time = 0
1406
+ train_start = time.time()
1407
+ steps_trained_progress_bar = tqdm(
1408
+ range(total_train_steps), desc="Train steps ... ", position=0, disable=not accelerator.is_local_main_process
1409
+ )
1410
+ continue_training = True
1411
+ epochs_trained = 0
1412
+ cur_step = 0
1413
+
1414
+ checkpoint = None
1415
+ if training_args.resume_from_checkpoint is not None:
1416
+ checkpoint = training_args.resume_from_checkpoint
1417
+ elif last_checkpoint is not None:
1418
+ checkpoint = last_checkpoint
1419
+
1420
+ if accelerator.is_main_process:
1421
+ if training_args.push_to_hub:
1422
+ # Retrieve or infer repo_name
1423
+ repo_name = training_args.hub_model_id
1424
+ if repo_name is None:
1425
+ repo_name = Path(training_args.output_dir).absolute().name
1426
+ # Create repo and retrieve repo_id
1427
+ repo_id = create_repo(repo_name, exist_ok=True, token=training_args.hub_token).repo_id
1428
+ # Clone repo locally
1429
+ repo = Repository(training_args.output_dir, clone_from=repo_id, token=training_args.hub_token)
1430
+
1431
+ # append "wandb" to .gitignore without truncating any existing entries
+ with open(os.path.join(training_args.output_dir, ".gitignore"), "a+") as gitignore:
1432
+ gitignore.seek(0)
+ if "wandb" not in gitignore.read():
1433
+ gitignore.write("wandb\n")
1434
+ elif training_args.output_dir is not None:
1435
+ os.makedirs(training_args.output_dir, exist_ok=True)
1436
+ accelerator.wait_for_everyone()
1437
+
1438
+ # Now save everything to be able to create a single processor later
1439
+ # make sure all processes wait until data is saved
1440
+ with accelerator.main_process_first():
1441
+ # only the main process saves them
1442
+ if accelerator.is_main_process:
1443
+ # save feature extractor, tokenizer and config
1444
+ if (
1445
+ model_args.prompt_tokenizer_name is None
1446
+ and model_args.description_tokenizer_name is None
1447
+ or (model_args.prompt_tokenizer_name == model_args.description_tokenizer_name)
1448
+ ):
1449
+ prompt_tokenizer.save_pretrained(training_args.output_dir)
1450
+ else:
1451
+ logger.warning(
1452
+ "Prompt tokenizer ('{model_args.prompt_tokenizer_name}') and description tokenizer ('{model_args.description_tokenizer_name}') are not the same. Saving only the prompt tokenizer."
1453
+ )
1454
+ prompt_tokenizer.save_pretrained(training_args.output_dir)
1455
+
1456
+ feature_extractor.save_pretrained(training_args.output_dir)
1457
+ config.save_pretrained(training_args.output_dir)
1458
+
1459
+ if checkpoint is not None:
1460
+ accelerator.load_state(checkpoint)
1461
+ # Find num steps and epoch from saved state string pattern
1462
+ pattern = r"checkpoint-(\d+)-epoch-(\d+)"
1463
+ match = re.search(pattern, checkpoint)
1464
+ cur_step = int(match.group(1))
1465
+ epochs_trained = int(match.group(2))
1466
+
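+ # Example of the expected pattern (illustrative): a checkpoint directory named
+ # "checkpoint-500-epoch-2" yields cur_step=500 and epochs_trained=2.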
1467
+ logger.info(" Continuing training from checkpoint, will skip to saved global_step")
1468
+ logger.info(f" Continuing training from epoch {epochs_trained}")
1469
+ logger.info(f" Continuing training from global step {cur_step}")
1470
+
1471
+ steps_trained_progress_bar.update(cur_step)
1472
+
1473
+ for epoch in range(0, epochs_trained):
1474
+ vectorized_datasets["train"] = vectorized_datasets["train"].shuffle(training_args.seed)
1475
+
1476
+ if training_args.max_steps < 0:
1477
+ # we know exactly the number of steps per epoch, so we can skip through the required number of batches
1478
+ resume_step = (cur_step - epochs_trained * steps_per_epoch) * gradient_accumulation_steps
1479
+ else:
1480
+ # Currently we don't know how many steps we've taken in the current epoch
1481
+ # So we just shuffle the dataset one extra time and start from a fresh epoch
1482
+ # This is "good enough" for our purposes but not fully correct
1483
+ resume_step = None
1484
+ vectorized_datasets["train"] = vectorized_datasets["train"].shuffle(training_args.seed)
1485
+ else:
1486
+ resume_step = None
1487
+
1488
+ gen_kwargs = {
1489
+ "do_sample": model_args.do_sample,
1490
+ "temperature": model_args.temperature,
1491
+ "max_length": model_args.max_length,
1492
+ }
1493
+
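+ # Illustrative effect of these kwargs (values are run-dependent, not defaults):
+ # do_sample=True with temperature=1.0 gives stochastic decoding, and max_length
+ # bounds the number of generated codec frames, hence the audio duration.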
1494
+ # Define gradient update step fn
1495
+ def train_step(
1496
+ batch,
1497
+ accelerator,
1498
+ autocast_kwargs,
1499
+ ):
1500
+ model.train()
1501
+
1502
+ if mixed_precision == "fp16":
1503
+ # fp16 doesn't work with T5-like models
1504
+ with accelerator.autocast(autocast_handler=autocast_kwargs):
1505
+ if training_args.parallel_mode.value != "distributed":
1506
+ encoder_outputs = model.text_encoder(
1507
+ input_ids=batch.get("input_ids"), attention_mask=batch.get("attention_mask", None)
1508
+ )
1509
+ else:
1510
+ encoder_outputs = model.module.text_encoder(
1511
+ input_ids=batch.get("input_ids"), attention_mask=batch.get("attention_mask", None)
1512
+ )
1513
+ batch["encoder_outputs"] = encoder_outputs
1514
+
1515
+ outputs = model(**batch)
1516
+ # CE (data) loss
1517
+ ce_loss = outputs.loss
1518
+
1519
+ metrics = {"loss": ce_loss}
1520
+ return ce_loss, metrics
1521
+
1522
+ # Define eval fn
1523
+ def eval_step(
1524
+ batch,
1525
+ accelerator,
1526
+ autocast_kwargs,
1527
+ ):
1528
+ eval_model = model if not training_args.torch_compile else model._orig_mod
1529
+ eval_model.eval()
1530
+
1531
+ if mixed_precision == "fp16":
1532
+ # fp16 doesn't work with T5-like models
1533
+ with accelerator.autocast(autocast_handler=autocast_kwargs):
1534
+ with torch.no_grad():
1535
+ if training_args.parallel_mode.value != "distributed" or training_args.torch_compile:
1536
+ encoder_outputs = eval_model.text_encoder(
1537
+ input_ids=batch.get("input_ids"), attention_mask=batch.get("attention_mask", None)
1538
+ )
1539
+ else:
1540
+ encoder_outputs = eval_model.module.text_encoder(
1541
+ input_ids=batch.get("input_ids"), attention_mask=batch.get("attention_mask", None)
1542
+ )
1543
+ batch["encoder_outputs"] = encoder_outputs
1544
+
1545
+ with torch.no_grad():
1546
+ outputs = eval_model(**batch)
1547
+ # CE (data) loss
1548
+ ce_loss = outputs.loss
1549
+ metrics = {"loss": ce_loss}
1550
+ return metrics
1551
+
1552
+ def generate_step(batch):
1553
+ batch.pop("decoder_attention_mask", None)
1554
+ eval_model = accelerator.unwrap_model(model, keep_fp32_wrapper=mixed_precision != "fp16").eval()
1555
+ if training_args.torch_compile:
1556
+ eval_model = model._orig_mod
1557
+
1558
+ output_audios = eval_model.generate(**batch, **gen_kwargs)
1559
+ output_audios = accelerator.pad_across_processes(output_audios, dim=1, pad_index=0)
1560
+ return output_audios
1561
+
1562
+ for epoch in range(epochs_trained, num_epochs):
1563
+ vectorized_datasets["train"] = vectorized_datasets["train"].shuffle(training_args.seed)
1564
+ sampler = None
1565
+ if training_args.group_by_length:
1566
+ sampler = LengthGroupedSampler(train_batch_size, lengths=vectorized_datasets["train"]["target_length"])
1567
+ train_dataloader = DataLoader(
1568
+ vectorized_datasets["train"],
1569
+ collate_fn=data_collator,
1570
+ batch_size=per_device_train_batch_size,
1571
+ sampler=sampler,
1572
+ num_workers=training_args.dataloader_num_workers,
1573
+ pin_memory=training_args.dataloader_pin_memory,
1574
+ )
1575
+ train_dataloader = accelerator.prepare(train_dataloader)
1576
+ if hasattr(train_dataloader, "dataset") and isinstance(train_dataloader.dataset, IterableDataset):
1577
+ train_dataloader.dataset.set_epoch(epoch)
1578
+
1579
+ if resume_step is not None:
1580
+ # Skip the first N batches in the dataloader when resuming from a checkpoint
1581
+ train_dataloader = accelerator.skip_first_batches(train_dataloader, resume_step)
1582
+ resume_step = None
1583
+
1584
+ for batch in train_dataloader:
1585
+ with accelerator.accumulate(model):
1586
+ loss, train_metric = train_step(batch, accelerator, autocast_kwargs)
1587
+ accelerator.backward(loss)
1588
+ if accelerator.sync_gradients:
1589
+ accelerator.clip_grad_norm_(model.parameters(), training_args.max_grad_norm)
1590
+ optimizer.step()
1591
+ lr_scheduler.step()
1592
+ optimizer.zero_grad()
1593
+
1594
+ # Check if the accelerator has performed an optimization step behind the scenes
1595
+ if accelerator.sync_gradients:
1596
+ steps_trained_progress_bar.update(1)
1597
+ cur_step += 1
1598
+
1599
+ if cur_step % training_args.logging_steps == 0:
1600
+ steps_trained_progress_bar.write(
1601
+ f"Step... ({cur_step} / {total_train_steps} | Loss:"
1602
+ f" {train_metric['loss']}, Learning Rate:"
1603
+ f" {lr_scheduler.get_last_lr()[0]})"
1604
+ )
1605
+ log_metric(
1606
+ accelerator,
1607
+ metrics=train_metric,
1608
+ learning_rate=lr_scheduler.get_last_lr()[0],
1609
+ train_time=train_time + time.time() - train_start,
1610
+ step=cur_step,
1611
+ epoch=epoch + (cur_step - epoch * steps_per_epoch) / steps_per_epoch,
1612
+ prefix="train",
1613
+ )
1614
+
1615
+ # save checkpoint and weights after each save_steps and at the end of training
1616
+ if (cur_step % save_steps == 0) or cur_step == total_train_steps:
1617
+ intermediate_dir = os.path.join(training_args.output_dir, f"checkpoint-{cur_step}-epoch-{epoch}")
1618
+ # safe_serialization=False to avoid shared tensors saving issue (TODO(YL): it's a temporary fix)
1619
+ # https://github.com/huggingface/transformers/issues/27293#issuecomment-1872560074
1620
+ accelerator.save_state(output_dir=intermediate_dir, safe_serialization=False)
1621
+ accelerator.wait_for_everyone()
1622
+ if accelerator.is_main_process:
1623
+ rotate_checkpoints(training_args.save_total_limit, output_dir=training_args.output_dir)
1624
+
1625
+ if cur_step == total_train_steps:
1626
+ # un-wrap model for saving
1627
+ unwrapped_model = accelerator.unwrap_model(model)
1628
+ unwrapped_model.save_pretrained(training_args.output_dir)
1629
+
1630
+ if training_args.push_to_hub:
1631
+ repo.push_to_hub(
1632
+ commit_message=f"Saving train state of step {cur_step}",
1633
+ blocking=False,
1634
+ )
1635
+
1636
+ if training_args.do_eval and (cur_step % eval_steps == 0 or cur_step == total_train_steps):
1637
+ train_time += time.time() - train_start
1638
+ # ======================== Evaluating ==============================
1639
+ eval_metrics = []
1640
+ eval_preds = []
1641
+ eval_descriptions = []
1642
+ eval_prompts = []
1643
+ eval_start = time.time()
1644
+
1645
+ # release training input batch
1646
+ batch = release_memory(batch)
1647
+
1648
+ validation_dataloader = DataLoader(
1649
+ vectorized_datasets["eval"],
1650
+ collate_fn=data_collator,
1651
+ batch_size=per_device_eval_batch_size,
1652
+ drop_last=False,
1653
+ num_workers=training_args.dataloader_num_workers,
1654
+ pin_memory=training_args.dataloader_pin_memory,
1655
+ )
1656
+ validation_dataloader = accelerator.prepare(validation_dataloader)
1657
+
1658
+ for batch in tqdm(
1659
+ validation_dataloader,
1660
+ desc="Evaluating - Inference ...",
1661
+ position=2,
1662
+ disable=not accelerator.is_local_main_process,
1663
+ ):
1664
+ # Model forward
1665
+ eval_metric = eval_step(batch, accelerator, autocast_kwargs)
1666
+ eval_metric = accelerator.gather_for_metrics(eval_metric)
1667
+ eval_metrics.append(eval_metric)
1668
+
1669
+ if training_args.predict_with_generate:
1670
+ validation_dataloader = DataLoader(
1671
+ vectorized_datasets["eval"],
1672
+ collate_fn=data_collator,
1673
+ batch_size=per_device_eval_batch_size,
1674
+ drop_last=False,
1675
+ num_workers=training_args.dataloader_num_workers,
1676
+ pin_memory=training_args.dataloader_pin_memory,
1677
+ )
1678
+ validation_dataloader = accelerator.prepare(validation_dataloader)
1679
+ # generation
1680
+ for batch in tqdm(
1681
+ validation_dataloader,
1682
+ desc="Evaluating - Generation ...",
1683
+ position=2,
1684
+ disable=not accelerator.is_local_main_process,
1685
+ ):
1686
+ generated_audios = generate_step(batch)
1687
+ # Gather all predictions and targets
1688
+ generated_audios, input_ids, prompts = accelerator.pad_across_processes(
1689
+ (generated_audios, batch["input_ids"], batch["prompt_input_ids"]), dim=1, pad_index=0
1690
+ )
1691
+ generated_audios, input_ids, prompts = accelerator.gather_for_metrics(
1692
+ (generated_audios, input_ids, prompts)
1693
+ )
1694
+ eval_preds.extend(generated_audios.to("cpu"))
1695
+ eval_descriptions.extend(input_ids.to("cpu"))
1696
+ eval_prompts.extend(prompts.to("cpu"))
1697
+
1698
+ eval_time = time.time() - eval_start
1699
+ # normalize eval metrics
1700
+ eval_metrics = {
1701
+ key: torch.mean(torch.cat([d[key].unsqueeze(0) for d in eval_metrics]))
1702
+ for key in eval_metrics[0]
1703
+ }
1704
+
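+ # i.e. the per-batch loss tensors gathered above are stacked along a new batch
+ # dimension and averaged, producing one scalar per metric key.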
1705
+ # compute metrics
1706
+ metrics_desc = ""
1707
+ if training_args.predict_with_generate:
1708
+ metric_values, pred_descriptions, pred_prompts, audios, transcriptions = compute_metrics(
1709
+ eval_preds, eval_descriptions, eval_prompts, accelerator.device
1710
+ )
1711
+ eval_metrics.update(metric_values)
1712
+ metrics_desc = " ".join([f"Eval {key}: {value} |" for key, value in metric_values.items()])
1713
+ if "wandb" in training_args.report_to:
1714
+ log_pred(
1715
+ accelerator,
1716
+ pred_descriptions,
1717
+ pred_prompts,
1718
+ transcriptions,
1719
+ audios,
1720
+ sampling_rate=sampling_rate,
1721
+ step=cur_step,
1722
+ prefix="eval",
1723
+ )
1724
+
1725
+ # Print metrics and update progress bar
1726
+ steps_trained_progress_bar.write(
1727
+ f"Eval results for step ({cur_step} / {total_train_steps} | Eval Loss: {eval_metrics['loss']} |"
1728
+ f" {metrics_desc})"
1729
+ )
1730
+
1731
+ log_metric(
1732
+ accelerator,
1733
+ metrics=eval_metrics,
1734
+ train_time=eval_time,
1735
+ step=cur_step,
1736
+ epoch=epoch + (cur_step - epoch * steps_per_epoch) / steps_per_epoch,
1737
+ prefix="eval",
1738
+ )
1739
+
1740
+ # release eval batch and reset metrics
1741
+ eval_metrics = []
1742
+ eval_preds = []
1743
+ eval_descriptions = []
1744
+ eval_prompts = []
1745
+ batch = release_memory(batch)
1746
+
1747
+ # flush the train metrics
1748
+ train_start = time.time()
1749
+
1750
+ # break condition
1751
+ if cur_step == total_train_steps:
1752
+ continue_training = False
1753
+ break
1754
+
1755
+ if not continue_training:
1756
+ break
1757
+
1758
+ accelerator.end_training()
1759
+
1760
+
1761
+ if __name__ == "__main__":
1762
+ set_start_method("spawn")
1763
+ main()
wandb/debug-cli.sanchit.log ADDED
File without changes
wandb/debug-internal.log ADDED
The diff for this file is too large to render. See raw diff
wandb/debug.log ADDED
@@ -0,0 +1,35 @@
1
+ 2024-05-13 20:52:49,015 INFO MainThread:1257680 [wandb_setup.py:_flush():76] Current SDK version is 0.17.0
2
+ 2024-05-13 20:52:49,015 INFO MainThread:1257680 [wandb_setup.py:_flush():76] Configure stats pid to 1257680
3
+ 2024-05-13 20:52:49,015 INFO MainThread:1257680 [wandb_setup.py:_flush():76] Loading settings from /home/sanchit/.config/wandb/settings
4
+ 2024-05-13 20:52:49,015 INFO MainThread:1257680 [wandb_setup.py:_flush():76] Loading settings from /raid/sanchit/parler-tts-mini-v0.1-expresso-concatenated-combined/wandb/settings
5
+ 2024-05-13 20:52:49,015 INFO MainThread:1257680 [wandb_setup.py:_flush():76] Loading settings from environment variables: {}
6
+ 2024-05-13 20:52:49,015 INFO MainThread:1257680 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
7
+ 2024-05-13 20:52:49,015 INFO MainThread:1257680 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'run_parler_tts_training.py', 'program_abspath': '/raid/sanchit/parler-tts-mini-v0.1-expresso-concatenated-combined/run_parler_tts_training.py', 'program': '/raid/sanchit/parler-tts-mini-v0.1-expresso-concatenated-combined/run_parler_tts_training.py'}
8
+ 2024-05-13 20:52:49,015 INFO MainThread:1257680 [wandb_setup.py:_flush():76] Applying login settings: {}
9
+ 2024-05-13 20:52:49,015 INFO MainThread:1257680 [wandb_init.py:_log_setup():520] Logging user logs to /raid/sanchit/parler-tts-mini-v0.1-expresso-concatenated-combined/wandb/run-20240513_205249-qaoje1x9/logs/debug.log
10
+ 2024-05-13 20:52:49,015 INFO MainThread:1257680 [wandb_init.py:_log_setup():521] Logging internal logs to /raid/sanchit/parler-tts-mini-v0.1-expresso-concatenated-combined/wandb/run-20240513_205249-qaoje1x9/logs/debug-internal.log
11
+ 2024-05-13 20:52:49,015 INFO MainThread:1257680 [wandb_init.py:init():560] calling init triggers
12
+ 2024-05-13 20:52:49,015 INFO MainThread:1257680 [wandb_init.py:init():567] wandb.init called with sweep_config: {}
13
+ config: {}
14
+ 2024-05-13 20:52:49,015 INFO MainThread:1257680 [wandb_init.py:init():610] starting backend
15
+ 2024-05-13 20:52:49,015 INFO MainThread:1257680 [wandb_init.py:init():614] setting up manager
16
+ 2024-05-13 20:52:49,019 INFO MainThread:1257680 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
17
+ 2024-05-13 20:52:49,019 INFO MainThread:1257680 [wandb_init.py:init():622] backend started and connected
18
+ 2024-05-13 20:52:49,021 INFO MainThread:1257680 [wandb_init.py:init():711] updated telemetry
19
+ 2024-05-13 20:52:49,024 INFO MainThread:1257680 [wandb_init.py:init():744] communicating run to backend with 90.0 second timeout
20
+ 2024-05-13 20:52:49,413 INFO MainThread:1257680 [wandb_run.py:_on_init():2396] communicating current version
21
+ 2024-05-13 20:52:49,474 INFO MainThread:1257680 [wandb_run.py:_on_init():2405] got version response
22
+ 2024-05-13 20:52:49,475 INFO MainThread:1257680 [wandb_init.py:init():795] starting run threads in backend
23
+ 2024-05-13 20:52:52,103 INFO MainThread:1257680 [wandb_run.py:_console_start():2374] atexit reg
24
+ 2024-05-13 20:52:52,103 INFO MainThread:1257680 [wandb_run.py:_redirect():2229] redirect: wrap_raw
25
+ 2024-05-13 20:52:52,103 INFO MainThread:1257680 [wandb_run.py:_redirect():2294] Wrapping output streams.
26
+ 2024-05-13 20:52:52,103 INFO MainThread:1257680 [wandb_run.py:_redirect():2319] Redirects installed.
27
+ 2024-05-13 20:52:52,104 INFO MainThread:1257680 [wandb_init.py:init():838] run started, returning control to user process
28
+ 2024-05-13 20:52:52,104 INFO MainThread:1257680 [wandb_run.py:_config_callback():1376] config_cb None None {'learning_rate': 8e-05, 'model_name_or_path': 'parler-tts/parler_tts_mini_v0.1', 'num_train_epochs': 8, 'gradient_accumulation_steps': 8, 'per_device_train_batch_size': 16, 'global_batch_size': 16, 'mixed_precision': 'bf16', 'lr_scheduler_type': 'SchedulerType.COSINE', 'warmup_steps': 250, 'freeze_text_encoder': True, 'max_duration_in_seconds': 30.0, 'weight_decay': 0.01, 'adam_beta1': 0.9, 'adam_beta2': 0.99, 'temperature': 1.0}
29
+ 2024-05-13 23:01:32,489 INFO MainThread:1257680 [wandb_run.py:_finish():2103] finishing run sanchit-gandhi/parler-speech/qaoje1x9
30
+ 2024-05-13 23:01:32,489 INFO MainThread:1257680 [wandb_run.py:_atexit_cleanup():2343] got exitcode: 0
31
+ 2024-05-13 23:01:32,489 INFO MainThread:1257680 [wandb_run.py:_restore():2326] restore
32
+ 2024-05-13 23:01:32,489 INFO MainThread:1257680 [wandb_run.py:_restore():2332] restore done
33
+ 2024-05-13 23:01:46,253 INFO MainThread:1257680 [wandb_run.py:_footer_history_summary_info():3994] rendering history
34
+ 2024-05-13 23:01:46,254 INFO MainThread:1257680 [wandb_run.py:_footer_history_summary_info():4026] rendering summary
35
+ 2024-05-13 23:01:46,256 INFO MainThread:1257680 [wandb_run.py:_footer_sync_info():3953] logging synced files
wandb/run-20240513_204652-m0g0ap7d/files/conda-environment.yaml ADDED
@@ -0,0 +1,248 @@
1
+ name: venv
2
+ channels:
3
+ - defaults
4
+ dependencies:
5
+ - _libgcc_mutex=0.1=main
6
+ - _openmp_mutex=5.1=1_gnu
7
+ - bzip2=1.0.8=h5eee18b_6
8
+ - ca-certificates=2024.3.11=h06a4308_0
9
+ - ld_impl_linux-64=2.38=h1181459_1
10
+ - libffi=3.4.4=h6a678d5_1
11
+ - libgcc-ng=11.2.0=h1234567_1
12
+ - libgomp=11.2.0=h1234567_1
13
+ - libstdcxx-ng=11.2.0=h1234567_1
14
+ - libuuid=1.41.5=h5eee18b_0
15
+ - ncurses=6.4=h6a678d5_0
16
+ - openssl=3.0.13=h7f8727e_1
17
+ - pip=24.0=py311h06a4308_0
18
+ - python=3.11.9=h955ad1f_0
19
+ - readline=8.2=h5eee18b_0
20
+ - setuptools=69.5.1=py311h06a4308_0
21
+ - sqlite=3.45.3=h5eee18b_0
22
+ - tk=8.6.14=h39e8969_0
23
+ - wheel=0.43.0=py311h06a4308_0
24
+ - xz=5.4.6=h5eee18b_1
25
+ - zlib=1.2.13=h5eee18b_1
26
+ - pip:
27
+ - absl-py==2.1.0
28
+ - accelerate==0.30.0
29
+ - aiohttp==3.9.5
30
+ - aiosignal==1.3.1
31
+ - aniso8601==9.0.1
32
+ - annotated-types==0.6.0
33
+ - anyio==4.3.0
34
+ - argbind==0.3.7
35
+ - argon2-cffi==23.1.0
36
+ - argon2-cffi-bindings==21.2.0
37
+ - arrow==1.3.0
38
+ - asttokens==2.4.1
39
+ - async-lru==2.0.4
40
+ - attrs==23.2.0
41
+ - audioread==3.0.1
42
+ - babel==2.15.0
43
+ - beautifulsoup4==4.12.3
44
+ - bidict==0.23.1
45
+ - bitsandbytes==0.43.1
46
+ - bleach==6.1.0
47
+ - certifi==2024.2.2
48
+ - cffi==1.16.0
49
+ - charset-normalizer==3.3.2
50
+ - click==8.1.7
51
+ - coloredlogs==14.0
52
+ - comm==0.2.2
53
+ - contourpy==1.2.1
54
+ - cycler==0.12.1
55
+ - datasets==2.19.1
56
+ - debugpy==1.8.1
57
+ - decorator==5.1.1
58
+ - defusedxml==0.7.1
59
+ - descript-audio-codec==1.0.0
60
+ - descript-audiotools==0.7.2
61
+ - dill==0.3.8
62
+ - dnspython==2.3.0
63
+ - docker-pycreds==0.4.0
64
+ - docstring-parser==0.16
65
+ - editdistance==0.8.1
66
+ - einops==0.8.0
67
+ - et-xmlfile==1.1.0
68
+ - evaluate==0.4.2
69
+ - eventlet==0.36.1
70
+ - executing==2.0.1
71
+ - fastjsonschema==2.19.1
72
+ - ffmpy==0.3.2
73
+ - filelock==3.14.0
74
+ - fire==0.6.0
75
+ - flask==2.2.5
76
+ - flask-cors==4.0.1
77
+ - flask-restful==0.3.10
78
+ - flask-socketio==5.3.6
79
+ - flask-talisman==1.1.0
80
+ - flatten-dict==0.4.2
81
+ - fonttools==4.51.0
82
+ - fqdn==1.5.1
83
+ - frozenlist==1.4.1
84
+ - fsspec==2024.3.1
85
+ - future==1.0.0
86
+ - g2p==2.0.0
87
+ - gitdb==4.0.11
88
+ - gitpython==3.1.43
89
+ - greenlet==3.0.3
90
+ - grpcio==1.63.0
91
+ - h11==0.14.0
92
+ - httpcore==1.0.5
93
+ - httpx==0.27.0
94
+ - huggingface-hub==0.23.0
95
+ - humanfriendly==10.0
96
+ - idna==3.7
97
+ - importlib-resources==6.4.0
98
+ - ipdb==0.13.13
99
+ - ipykernel==6.29.4
100
+ - ipython==8.24.0
101
+ - isoduration==20.11.0
102
+ - itsdangerous==2.2.0
103
+ - jedi==0.19.1
104
+ - jinja2==3.1.4
105
+ - jiwer==3.0.4
106
+ - joblib==1.4.2
107
+ - json5==0.9.25
108
+ - jsonpointer==2.4
109
+ - jsonschema==4.22.0
110
+ - jsonschema-specifications==2023.12.1
111
+ - julius==0.2.7
112
+ - jupyter-client==8.6.1
113
+ - jupyter-core==5.7.2
114
+ - jupyter-events==0.10.0
115
+ - jupyter-lsp==2.2.5
116
+ - jupyter-server==2.14.0
117
+ - jupyter-server-terminals==0.5.3
118
+ - jupyterlab==4.2.0
119
+ - jupyterlab-pygments==0.3.0
120
+ - jupyterlab-server==2.27.1
121
+ - kiwisolver==1.4.5
122
+ - lazy-loader==0.4
123
+ - librosa==0.10.2
124
+ - llvmlite==0.42.0
125
+ - markdown==3.6
126
+ - markdown-it-py==3.0.0
127
+ - markdown2==2.4.13
128
+ - markupsafe==2.1.5
129
+ - matplotlib==3.8.4
130
+ - matplotlib-inline==0.1.7
131
+ - mdurl==0.1.2
132
+ - mistune==3.0.2
133
+ - mpmath==1.3.0
134
+ - msgpack==1.0.8
135
+ - multidict==6.0.5
136
+ - multiprocess==0.70.16
137
+ - munkres==1.1.4
138
+ - nbclient==0.10.0
139
+ - nbconvert==7.16.4
140
+ - nbformat==5.10.4
141
+ - nest-asyncio==1.6.0
142
+ - networkx==3.3
143
+ - notebook-shim==0.2.4
144
+ - numba==0.59.1
145
+ - numpy==1.26.4
146
+ - nvidia-cublas-cu12==12.1.3.1
147
+ - nvidia-cuda-cupti-cu12==12.1.105
148
+ - nvidia-cuda-nvrtc-cu12==12.1.105
149
+ - nvidia-cuda-runtime-cu12==12.1.105
150
+ - nvidia-cudnn-cu12==8.9.2.26
151
+ - nvidia-cufft-cu12==11.0.2.54
152
+ - nvidia-curand-cu12==10.3.2.106
153
+ - nvidia-cusolver-cu12==11.4.5.107
154
+ - nvidia-cusparse-cu12==12.1.0.106
155
+ - nvidia-nccl-cu12==2.20.5
156
+ - nvidia-nvjitlink-cu12==12.4.127
157
+ - nvidia-nvtx-cu12==12.1.105
158
+ - openpyxl==3.1.2
159
+ - overrides==7.7.0
160
+ - packaging==24.0
161
+ - pandas==2.2.2
162
+ - pandocfilters==1.5.1
163
+ - panphon==0.20.0
164
+ - parler-tts==0.1
165
+ - parso==0.8.4
166
+ - pexpect==4.9.0
167
+ - pillow==10.3.0
168
+ - platformdirs==4.2.1
169
+ - pooch==1.8.1
170
+ - prometheus-client==0.20.0
171
+ - prompt-toolkit==3.0.43
172
+ - protobuf==3.19.6
173
+ - psutil==5.9.8
174
+ - ptyprocess==0.7.0
175
+ - pure-eval==0.2.2
176
+ - pyarrow==16.0.0
177
+ - pyarrow-hotfix==0.6
178
+ - pycparser==2.22
179
+ - pydantic==2.7.1
180
+ - pydantic-core==2.18.2
181
+ - pygments==2.18.0
182
+ - pyloudnorm==0.1.1
183
+ - pyparsing==3.1.2
184
+ - pystoi==0.4.1
185
+ - python-dateutil==2.9.0.post0
186
+ - python-engineio==4.9.0
187
+ - python-json-logger==2.0.7
188
+ - python-socketio==5.11.2
189
+ - pytz==2024.1
190
+ - pyyaml==6.0.1
191
+ - pyzmq==26.0.3
192
+ - randomname==0.2.1
193
+ - rapidfuzz==3.9.0
194
+ - referencing==0.35.1
195
+ - regex==2024.4.28
196
+ - requests==2.31.0
197
+ - rfc3339-validator==0.1.4
198
+ - rfc3986-validator==0.1.1
199
+ - rich==13.7.1
200
+ - rpds-py==0.18.1
201
+ - safetensors==0.4.3
202
+ - scikit-learn==1.4.2
203
+ - scipy==1.13.0
204
+ - send2trash==1.8.3
205
+ - sentencepiece==0.2.0
206
+ - sentry-sdk==2.1.1
207
+ - setproctitle==1.3.3
208
+ - simple-websocket==1.0.0
209
+ - six==1.16.0
210
+ - smmap==5.0.1
211
+ - sniffio==1.3.1
212
+ - soundfile==0.12.1
213
+ - soupsieve==2.5
214
+ - soxr==0.3.7
215
+ - stack-data==0.6.3
216
+ - sympy==1.12
217
+ - tensorboard==2.16.2
218
+ - tensorboard-data-server==0.7.2
219
+ - termcolor==2.4.0
220
+ - terminado==0.18.1
221
+ - text-unidecode==1.3
222
+ - threadpoolctl==3.5.0
223
+ - tinycss2==1.3.0
224
+ - tokenizers==0.19.1
225
+ - torch==2.3.0
226
+ - torch-stoi==0.2.1
227
+ - torchaudio==2.3.0
228
+ - tornado==6.4
229
+ - tqdm==4.66.4
230
+ - traitlets==5.14.3
231
+ - transformers==4.41.0.dev0
232
+ - triton==2.3.0
233
+ - types-python-dateutil==2.9.0.20240316
234
+ - typing-extensions==4.11.0
235
+ - tzdata==2024.1
236
+ - unicodecsv==0.14.1
237
+ - uri-template==1.3.0
238
+ - urllib3==2.2.1
239
+ - wandb==0.17.0
240
+ - wcwidth==0.2.13
241
+ - webcolors==1.13
242
+ - webencodings==0.5.1
243
+ - websocket-client==1.8.0
244
+ - werkzeug==3.0.3
245
+ - wsproto==1.2.0
246
+ - xxhash==3.4.1
247
+ - yarl==1.9.4
248
+ prefix: /home/sanchit/miniconda3/envs/venv
wandb/run-20240513_204652-m0g0ap7d/files/config.yaml ADDED
@@ -0,0 +1,86 @@
 
1
+ wandb_version: 1
2
+
3
+ _wandb:
4
+ desc: null
5
+ value:
6
+ python_version: 3.11.9
7
+ cli_version: 0.17.0
8
+ framework: huggingface
9
+ huggingface_version: 4.41.0.dev0
10
+ is_jupyter_run: false
11
+ is_kaggle_kernel: false
12
+ start_time: 1715626012
13
+ t:
14
+ 1:
15
+ - 1
16
+ - 5
17
+ - 11
18
+ - 49
19
+ - 51
20
+ - 53
21
+ - 55
22
+ - 71
23
+ - 100
24
+ 2:
25
+ - 1
26
+ - 5
27
+ - 11
28
+ - 49
29
+ - 51
30
+ - 53
31
+ - 55
32
+ - 71
33
+ - 100
34
+ 3:
35
+ - 23
36
+ 4: 3.11.9
37
+ 5: 0.17.0
38
+ 6: 4.41.0.dev0
39
+ 8:
40
+ - 5
41
+ 13: linux-x86_64
42
+ learning_rate:
43
+ desc: null
44
+ value: 8.0e-05
45
+ model_name_or_path:
46
+ desc: null
47
+ value: parler-tts/parler_tts_mini_v0.1
48
+ num_train_epochs:
49
+ desc: null
50
+ value: 8
51
+ gradient_accumulation_steps:
52
+ desc: null
53
+ value: 8
54
+ per_device_train_batch_size:
55
+ desc: null
56
+ value: 16
57
+ global_batch_size:
58
+ desc: null
59
+ value: 16
60
+ mixed_precision:
61
+ desc: null
62
+ value: bf16
63
+ lr_scheduler_type:
64
+ desc: null
65
+ value: SchedulerType.COSINE
66
+ warmup_steps:
67
+ desc: null
68
+ value: 250
69
+ freeze_text_encoder:
70
+ desc: null
71
+ value: true
72
+ max_duration_in_seconds:
73
+ desc: null
74
+ value: 30.0
75
+ weight_decay:
76
+ desc: null
77
+ value: 0.01
78
+ adam_beta1:
79
+ desc: null
80
+ value: 0.9
81
+ adam_beta2:
82
+ desc: null
83
+ value: 0.99
84
+ temperature:
85
+ desc: null
86
+ value: 1.0
wandb/run-20240513_204652-m0g0ap7d/files/output.log ADDED
@@ -0,0 +1,180 @@
1
+ 05/13/2024 20:46:55 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 1, distributed training: False, 16-bits training: False
2
+ 05/13/2024 20:46:55 - INFO - __main__ - Training/evaluation parameters ParlerTTSTrainingArguments(
3
+ _n_gpu=1,
4
+ accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None},
5
+ adafactor=False,
6
+ adam_beta1=0.9,
7
+ adam_beta2=0.99,
8
+ adam_epsilon=1e-08,
9
+ audio_encoder_per_device_batch_size=4,
10
+ auto_find_batch_size=False,
11
+ batch_eval_metrics=False,
12
+ bf16=False,
13
+ bf16_full_eval=False,
14
+ data_seed=None,
15
+ dataloader_drop_last=False,
16
+ dataloader_num_workers=4,
17
+ dataloader_persistent_workers=False,
18
+ dataloader_pin_memory=True,
19
+ dataloader_prefetch_factor=None,
20
+ ddp_backend=None,
21
+ ddp_broadcast_buffers=None,
22
+ ddp_bucket_cap_mb=None,
23
+ ddp_find_unused_parameters=None,
24
+ ddp_timeout=1800,
25
+ debug=[],
26
+ deepspeed=None,
27
+ disable_tqdm=False,
28
+ dispatch_batches=None,
29
+ do_eval=True,
30
+ do_predict=False,
31
+ do_train=True,
32
+ dtype=bfloat16,
33
+ eval_accumulation_steps=None,
34
+ eval_delay=0,
35
+ eval_do_concat_batches=True,
36
+ eval_steps=None,
37
+ eval_strategy=IntervalStrategy.EPOCH,
38
+ evaluation_strategy=epoch,
39
+ fp16=False,
40
+ fp16_backend=auto,
41
+ fp16_full_eval=False,
42
+ fp16_opt_level=O1,
43
+ fsdp=[],
44
+ fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False},
45
+ fsdp_min_num_params=0,
46
+ fsdp_transformer_layer_cls_to_wrap=None,
47
+ full_determinism=False,
48
+ generation_config=None,
49
+ generation_max_length=None,
50
+ generation_num_beams=None,
51
+ gradient_accumulation_steps=8,
52
+ gradient_checkpointing=True,
53
+ gradient_checkpointing_kwargs=None,
54
+ greater_is_better=None,
55
+ group_by_length=True,
56
+ half_precision_backend=auto,
57
+ hub_always_push=False,
58
+ hub_model_id=None,
59
+ hub_private_repo=False,
60
+ hub_strategy=HubStrategy.EVERY_SAVE,
61
+ hub_token=<HUB_TOKEN>,
62
+ ignore_data_skip=False,
63
+ include_inputs_for_metrics=True,
64
+ include_num_input_tokens_seen=False,
65
+ include_tokens_per_second=False,
66
+ jit_mode_eval=False,
67
+ label_names=None,
68
+ label_smoothing_factor=0.0,
69
+ learning_rate=8e-05,
70
+ length_column_name=length,
71
+ load_best_model_at_end=False,
72
+ local_rank=0,
73
+ log_level=passive,
74
+ log_level_replica=warning,
75
+ log_on_each_node=True,
76
+ logging_dir=../output_dir_training_constant_concat/runs/May13_20-46-51_hf-dgx-01,
77
+ logging_first_step=False,
78
+ logging_nan_inf_filter=True,
79
+ logging_steps=5,
80
+ logging_strategy=IntervalStrategy.STEPS,
81
+ lr_scheduler_kwargs={},
82
+ lr_scheduler_type=SchedulerType.COSINE,
83
+ max_grad_norm=1.0,
84
+ max_steps=-1,
85
+ metric_for_best_model=None,
86
+ mp_parameters=,
87
+ neftune_noise_alpha=None,
88
+ no_cuda=False,
89
+ num_train_epochs=8,
90
+ optim=OptimizerNames.ADAMW_TORCH,
91
+ optim_args=None,
92
+ optim_target_modules=None,
93
+ output_dir=../output_dir_training_constant_concat/,
94
+ overwrite_output_dir=True,
95
+ past_index=-1,
96
+ per_device_eval_batch_size=16,
97
+ per_device_train_batch_size=16,
98
+ predict_with_generate=True,
99
+ prediction_loss_only=False,
100
+ push_to_hub=False,
101
+ push_to_hub_model_id=None,
102
+ push_to_hub_organization=None,
103
+ push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
104
+ ray_scope=last,
105
+ remove_unused_columns=True,
106
+ report_to=['wandb'],
107
+ restore_callback_states_from_checkpoint=False,
108
+ resume_from_checkpoint=None,
109
+ run_name=../output_dir_training_constant_concat/,
110
+ save_on_each_node=False,
111
+ save_only_model=False,
112
+ save_safetensors=True,
113
+ save_steps=500,
114
+ save_strategy=IntervalStrategy.EPOCH,
115
+ save_total_limit=5,
116
+ seed=456,
117
+ skip_memory_metrics=True,
118
+ sortish_sampler=False,
119
+ split_batches=None,
120
+ tf32=None,
121
+ torch_compile=False,
122
+ torch_compile_backend=None,
123
+ torch_compile_mode=None,
124
+ torchdynamo=None,
125
+ tpu_metrics_debug=False,
126
+ tpu_num_cores=None,
127
+ use_cpu=False,
128
+ use_ipex=False,
129
+ use_legacy_prediction_loop=False,
130
+ use_mps_device=False,
131
+ warmup_ratio=0.0,
132
+ warmup_steps=250,
133
+ weight_decay=0.01,
134
+ )
135
+ 05/13/2024 20:46:57 - WARNING - __main__ - Disabling fast tokenizer warning: https://github.com/huggingface/transformers/blob/main/src/transformers/tokenization_utils_base.py#L3231-L3235
136
+ loading configuration file preprocessor_config.json from cache at /raid/.cache/huggingface/models--parler-tts--dac_44khZ_8kbps/snapshots/db52bea859d9411e0beb44a3ea923a8731ee4197/preprocessor_config.json
137
+ Feature extractor EncodecFeatureExtractor {
138
+ "chunk_length_s": null,
139
+ "feature_extractor_type": "EncodecFeatureExtractor",
140
+ "feature_size": 1,
141
+ "overlap": null,
142
+ "padding_side": "right",
143
+ "padding_value": 0.0,
144
+ "return_attention_mask": true,
145
+ "sampling_rate": 44100
146
+ }
147
+ loading file spiece.model from cache at /raid/.cache/huggingface/models--parler-tts--parler_tts_mini_v0.1/snapshots/e02fd18e77d38b49a85c7a9a85189a64b8472544/spiece.model
148
+ loading file tokenizer.json from cache at /raid/.cache/huggingface/models--parler-tts--parler_tts_mini_v0.1/snapshots/e02fd18e77d38b49a85c7a9a85189a64b8472544/tokenizer.json
149
+ loading file added_tokens.json from cache at None
150
+ loading file special_tokens_map.json from cache at /raid/.cache/huggingface/models--parler-tts--parler_tts_mini_v0.1/snapshots/e02fd18e77d38b49a85c7a9a85189a64b8472544/special_tokens_map.json
151
+ loading file tokenizer_config.json from cache at /raid/.cache/huggingface/models--parler-tts--parler_tts_mini_v0.1/snapshots/e02fd18e77d38b49a85c7a9a85189a64b8472544/tokenizer_config.json
152
+ You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers
153
+ loading file spiece.model from cache at /raid/.cache/huggingface/models--parler-tts--parler_tts_mini_v0.1/snapshots/e02fd18e77d38b49a85c7a9a85189a64b8472544/spiece.model
154
+ loading file tokenizer.json from cache at /raid/.cache/huggingface/models--parler-tts--parler_tts_mini_v0.1/snapshots/e02fd18e77d38b49a85c7a9a85189a64b8472544/tokenizer.json
155
+ loading file added_tokens.json from cache at None
156
+ loading file special_tokens_map.json from cache at /raid/.cache/huggingface/models--parler-tts--parler_tts_mini_v0.1/snapshots/e02fd18e77d38b49a85c7a9a85189a64b8472544/special_tokens_map.json
157
+ loading file tokenizer_config.json from cache at /raid/.cache/huggingface/models--parler-tts--parler_tts_mini_v0.1/snapshots/e02fd18e77d38b49a85c7a9a85189a64b8472544/tokenizer_config.json
158
+ Combining datasets...: 0%| | 0/4 [00:00<?, ?it/s]
159
+ Combining datasets...: 0%| | 0/4 [03:35<?, ?it/s]
160
+ Traceback (most recent call last):
161
+ File "/raid/sanchit/parler-tts-mini-v0.1-expresso-concatenated-combined/run_parler_tts_training.py", line 1763, in <module>
162
+ main()
163
+ File "/raid/sanchit/parler-tts-mini-v0.1-expresso-concatenated-combined/run_parler_tts_training.py", line 950, in main
164
+ raw_datasets["train"] = load_multiple_datasets(
165
+ ^^^^^^^^^^^^^^^^^^^^^^^
166
+ File "/raid/sanchit/parler-tts-mini-v0.1-expresso-concatenated-combined/run_parler_tts_training.py", line 693, in load_multiple_datasets
167
+ metadata_dataset = load_dataset(
168
+ ^^^^^^^^^^^^^
169
+ File "/home/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/datasets/load.py", line 2587, in load_dataset
170
+ builder_instance = load_dataset_builder(
171
+ ^^^^^^^^^^^^^^^^^^^^^
172
+ File "/home/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/datasets/load.py", line 2296, in load_dataset_builder
173
+ builder_instance: DatasetBuilder = builder_cls(
174
+ ^^^^^^^^^^^^
175
+ File "/home/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/datasets/builder.py", line 374, in __init__
176
+ self.config, self.config_id = self._create_builder_config(
177
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
178
+ File "/home/sanchit/miniconda3/envs/venv/lib/python3.11/site-packages/datasets/builder.py", line 599, in _create_builder_config
179
+ raise ValueError(
180
+ ValueError: BuilderConfig 'read' not found. Available: ['default']
wandb/run-20240513_204652-m0g0ap7d/files/requirements.txt ADDED
@@ -0,0 +1,225 @@
+ Babel==2.15.0
+ Flask-Cors==4.0.1
+ Flask-RESTful==0.3.10
+ Flask-SocketIO==5.3.6
+ Flask==2.2.5
+ GitPython==3.1.43
+ Jinja2==3.1.4
+ Markdown==3.6
+ MarkupSafe==2.1.5
+ PyYAML==6.0.1
+ Pygments==2.18.0
+ Send2Trash==1.8.3
+ Werkzeug==3.0.3
+ absl-py==2.1.0
+ accelerate==0.30.0
+ aiohttp==3.9.5
+ aiosignal==1.3.1
+ aniso8601==9.0.1
+ annotated-types==0.6.0
+ anyio==4.3.0
+ argbind==0.3.7
+ argon2-cffi-bindings==21.2.0
+ argon2-cffi==23.1.0
+ arrow==1.3.0
+ asttokens==2.4.1
+ async-lru==2.0.4
+ attrs==23.2.0
+ audioread==3.0.1
+ beautifulsoup4==4.12.3
+ bidict==0.23.1
+ bitsandbytes==0.43.1
+ bleach==6.1.0
+ certifi==2024.2.2
+ cffi==1.16.0
+ charset-normalizer==3.3.2
+ click==8.1.7
+ coloredlogs==14.0
+ comm==0.2.2
+ contourpy==1.2.1
+ cycler==0.12.1
+ datasets==2.19.1
+ debugpy==1.8.1
+ decorator==5.1.1
+ defusedxml==0.7.1
+ descript-audio-codec==1.0.0
+ descript-audiotools==0.7.2
+ dill==0.3.8
+ dnspython==2.3.0
+ docker-pycreds==0.4.0
+ docstring_parser==0.16
+ editdistance==0.8.1
+ einops==0.8.0
+ et-xmlfile==1.1.0
+ evaluate==0.4.2
+ eventlet==0.36.1
+ executing==2.0.1
+ fastjsonschema==2.19.1
+ ffmpy==0.3.2
+ filelock==3.14.0
+ fire==0.6.0
+ flask-talisman==1.1.0
+ flatten-dict==0.4.2
+ fonttools==4.51.0
+ fqdn==1.5.1
+ frozenlist==1.4.1
+ fsspec==2024.3.1
+ future==1.0.0
+ g2p==2.0.0
+ gitdb==4.0.11
+ greenlet==3.0.3
+ grpcio==1.63.0
+ h11==0.14.0
+ httpcore==1.0.5
+ httpx==0.27.0
+ huggingface-hub==0.23.0
+ humanfriendly==10.0
+ idna==3.7
+ importlib_resources==6.4.0
+ ipdb==0.13.13
+ ipykernel==6.29.4
+ ipython==8.24.0
+ isoduration==20.11.0
+ itsdangerous==2.2.0
+ jedi==0.19.1
+ jiwer==3.0.4
+ joblib==1.4.2
+ json5==0.9.25
+ jsonpointer==2.4
+ jsonschema-specifications==2023.12.1
+ jsonschema==4.22.0
+ julius==0.2.7
+ jupyter-events==0.10.0
+ jupyter-lsp==2.2.5
+ jupyter_client==8.6.1
+ jupyter_core==5.7.2
+ jupyter_server==2.14.0
+ jupyter_server_terminals==0.5.3
+ jupyterlab==4.2.0
+ jupyterlab_pygments==0.3.0
+ jupyterlab_server==2.27.1
+ kiwisolver==1.4.5
+ lazy_loader==0.4
+ librosa==0.10.2
+ llvmlite==0.42.0
+ markdown-it-py==3.0.0
+ markdown2==2.4.13
+ matplotlib-inline==0.1.7
+ matplotlib==3.8.4
+ mdurl==0.1.2
+ mistune==3.0.2
+ mpmath==1.3.0
+ msgpack==1.0.8
+ multidict==6.0.5
+ multiprocess==0.70.16
+ munkres==1.1.4
+ nbclient==0.10.0
+ nbconvert==7.16.4
+ nbformat==5.10.4
+ nest-asyncio==1.6.0
+ networkx==3.3
+ notebook_shim==0.2.4
+ numba==0.59.1
+ numpy==1.26.4
+ nvidia-cublas-cu12==12.1.3.1
+ nvidia-cuda-cupti-cu12==12.1.105
+ nvidia-cuda-nvrtc-cu12==12.1.105
+ nvidia-cuda-runtime-cu12==12.1.105
+ nvidia-cudnn-cu12==8.9.2.26
+ nvidia-cufft-cu12==11.0.2.54
+ nvidia-curand-cu12==10.3.2.106
+ nvidia-cusolver-cu12==11.4.5.107
+ nvidia-cusparse-cu12==12.1.0.106
+ nvidia-nccl-cu12==2.20.5
+ nvidia-nvjitlink-cu12==12.4.127
+ nvidia-nvtx-cu12==12.1.105
+ openpyxl==3.1.2
+ overrides==7.7.0
+ packaging==24.0
+ pandas==2.2.2
+ pandocfilters==1.5.1
+ panphon==0.20.0
+ parler_tts==0.1
+ parso==0.8.4
+ pexpect==4.9.0
+ pillow==10.3.0
+ pip==24.0
+ platformdirs==4.2.1
+ pooch==1.8.1
+ prometheus_client==0.20.0
+ prompt-toolkit==3.0.43
+ protobuf==3.19.6
+ psutil==5.9.8
+ ptyprocess==0.7.0
+ pure-eval==0.2.2
+ pyarrow-hotfix==0.6
+ pyarrow==16.0.0
+ pycparser==2.22
+ pydantic==2.7.1
+ pydantic_core==2.18.2
+ pyloudnorm==0.1.1
+ pyparsing==3.1.2
+ pystoi==0.4.1
+ python-dateutil==2.9.0.post0
+ python-engineio==4.9.0
+ python-json-logger==2.0.7
+ python-socketio==5.11.2
+ pytz==2024.1
+ pyzmq==26.0.3
+ randomname==0.2.1
+ rapidfuzz==3.9.0
+ referencing==0.35.1
+ regex==2024.4.28
+ requests==2.31.0
+ rfc3339-validator==0.1.4
+ rfc3986-validator==0.1.1
+ rich==13.7.1
+ rpds-py==0.18.1
+ safetensors==0.4.3
+ scikit-learn==1.4.2
+ scipy==1.13.0
+ sentencepiece==0.2.0
+ sentry-sdk==2.1.1
+ setproctitle==1.3.3
+ setuptools==69.5.1
+ simple-websocket==1.0.0
+ six==1.16.0
+ smmap==5.0.1
+ sniffio==1.3.1
+ soundfile==0.12.1
+ soupsieve==2.5
+ soxr==0.3.7
+ stack-data==0.6.3
+ sympy==1.12
+ tensorboard-data-server==0.7.2
+ tensorboard==2.16.2
+ termcolor==2.4.0
+ terminado==0.18.1
+ text-unidecode==1.3
+ threadpoolctl==3.5.0
+ tinycss2==1.3.0
+ tokenizers==0.19.1
+ torch-stoi==0.2.1
+ torch==2.3.0
+ torchaudio==2.3.0
+ tornado==6.4
+ tqdm==4.66.4
+ traitlets==5.14.3
+ transformers==4.41.0.dev0
+ transformers==4.41.0.dev0
+ triton==2.3.0
+ types-python-dateutil==2.9.0.20240316
+ typing_extensions==4.11.0
+ tzdata==2024.1
+ unicodecsv==0.14.1
+ uri-template==1.3.0
+ urllib3==2.2.1
+ wandb==0.17.0
+ wcwidth==0.2.13
+ webcolors==1.13
+ webencodings==0.5.1
+ websocket-client==1.8.0
+ wheel==0.43.0
+ wsproto==1.2.0
+ xxhash==3.4.1
+ yarl==1.9.4
wandb/run-20240513_204652-m0g0ap7d/files/wandb-metadata.json ADDED
@@ -0,0 +1,706 @@
+ {
+   "os": "Linux-5.4.0-166-generic-x86_64-with-glibc2.31",
+   "python": "3.11.9",
+   "heartbeatAt": "2024-05-13T18:46:53.365083",
+   "startedAt": "2024-05-13T18:46:52.816759",
+   "docker": null,
+   "cuda": null,
+   "args": ["finetuning_concatenated_config.json"],
+   "state": "running",
+   "program": "/raid/sanchit/parler-tts-mini-v0.1-expresso-concatenated-combined/run_parler_tts_training.py",
+   "codePathLocal": "run_parler_tts_training.py",
+   "codePath": "run_parler_tts_training.py",
+   "git": {
+     "remote": "https://huggingface.co/sanchit-gandhi/parler-tts-mini-v0.1-expresso-concatenated-combined",
+     "commit": "50ba4323d7b8bb052629aa1b88283b9df081a821"
+   },
+   "email": "[email protected]",
+   "root": "/raid/sanchit/parler-tts-mini-v0.1-expresso-concatenated-combined",
+   "host": "hf-dgx-01",
+   "username": "sanchit",
+   "executable": "/home/sanchit/miniconda3/envs/venv/bin/python",
+   "cpu_count": 64,
+   "cpu_count_logical": 128,
+   "cpu_freq": {"current": 2257.736234375, "min": 1500.0, "max": 2250.0},
+   "cpu_freq_per_core": [
+     {"current": 1795.281, "min": 1500.0, "max": 2250.0},
+     {"current": 1794.292, "min": 1500.0, "max": 2250.0},
+     {"current": 1795.78, "min": 1500.0, "max": 2250.0},
+     {"current": 1792.55, "min": 1500.0, "max": 2250.0},
+     {"current": 1742.094, "min": 1500.0, "max": 2250.0},
+     {"current": 3026.54, "min": 1500.0, "max": 2250.0},
+     {"current": 1786.214, "min": 1500.0, "max": 2250.0},
+     {"current": 1742.547, "min": 1500.0, "max": 2250.0},
+     {"current": 1728.916, "min": 1500.0, "max": 2250.0},
+     {"current": 1734.023, "min": 1500.0, "max": 2250.0},
+     {"current": 3195.219, "min": 1500.0, "max": 2250.0},
+     {"current": 1733.722, "min": 1500.0, "max": 2250.0},
+     {"current": 3341.911, "min": 1500.0, "max": 2250.0},
+     {"current": 3325.608, "min": 1500.0, "max": 2250.0},
+     {"current": 3233.258, "min": 1500.0, "max": 2250.0},
+     {"current": 1669.847, "min": 1500.0, "max": 2250.0},
+     {"current": 1791.948, "min": 1500.0, "max": 2250.0},
+     {"current": 1796.794, "min": 1500.0, "max": 2250.0},
+     {"current": 1791.49, "min": 1500.0, "max": 2250.0},
+     {"current": 1793.945, "min": 1500.0, "max": 2250.0},
+     {"current": 3342.943, "min": 1500.0, "max": 2250.0},
+     {"current": 1669.791, "min": 1500.0, "max": 2250.0},
+     {"current": 1669.593, "min": 1500.0, "max": 2250.0},
+     {"current": 1694.312, "min": 1500.0, "max": 2250.0},
+     {"current": 1873.727, "min": 1500.0, "max": 2250.0},
+     {"current": 1724.813, "min": 1500.0, "max": 2250.0},
+     {"current": 2354.471, "min": 1500.0, "max": 2250.0},
+     {"current": 1718.662, "min": 1500.0, "max": 2250.0},
+     {"current": 1670.588, "min": 1500.0, "max": 2250.0},
+     {"current": 1665.577, "min": 1500.0, "max": 2250.0},
+     {"current": 1616.671, "min": 1500.0, "max": 2250.0},
+     {"current": 2080.81, "min": 1500.0, "max": 2250.0},
+     {"current": 1670.666, "min": 1500.0, "max": 2250.0},
+     {"current": 1652.559, "min": 1500.0, "max": 2250.0},
+     {"current": 3323.654, "min": 1500.0, "max": 2250.0},
+     {"current": 1671.311, "min": 1500.0, "max": 2250.0},
+     {"current": 1726.286, "min": 1500.0, "max": 2250.0},
+     {"current": 1670.365, "min": 1500.0, "max": 2250.0},
+     {"current": 3320.57, "min": 1500.0, "max": 2250.0},
+     {"current": 1669.941, "min": 1500.0, "max": 2250.0},
+     {"current": 1791.021, "min": 1500.0, "max": 2250.0},
+     {"current": 1796.246, "min": 1500.0, "max": 2250.0},
+     {"current": 1793.946, "min": 1500.0, "max": 2250.0},
+     {"current": 1794.848, "min": 1500.0, "max": 2250.0},
+     {"current": 3339.327, "min": 1500.0, "max": 2250.0},
+     {"current": 3344.315, "min": 1500.0, "max": 2250.0},
+     {"current": 3338.901, "min": 1500.0, "max": 2250.0},
+     {"current": 1668.541, "min": 1500.0, "max": 2250.0},
+     {"current": 1794.526, "min": 1500.0, "max": 2250.0},
+     {"current": 1792.886, "min": 1500.0, "max": 2250.0},
+     {"current": 1796.844, "min": 1500.0, "max": 2250.0},
+     {"current": 1793.81, "min": 1500.0, "max": 2250.0},
+     {"current": 1724.861, "min": 1500.0, "max": 2250.0},
+     {"current": 2294.458, "min": 1500.0, "max": 2250.0},
+     {"current": 1720.835, "min": 1500.0, "max": 2250.0},
+     {"current": 1720.155, "min": 1500.0, "max": 2250.0},
+     {"current": 1668.96, "min": 1500.0, "max": 2250.0},
+     {"current": 1976.5, "min": 1500.0, "max": 2250.0},
+     {"current": 2241.578, "min": 1500.0, "max": 2250.0},
+     {"current": 1671.964, "min": 1500.0, "max": 2250.0},
+     {"current": 3319.623, "min": 1500.0, "max": 2250.0},
+     {"current": 1670.777, "min": 1500.0, "max": 2250.0},
+     {"current": 1670.389, "min": 1500.0, "max": 2250.0},
+     {"current": 1669.629, "min": 1500.0, "max": 2250.0},
+     {"current": 1794.19, "min": 1500.0, "max": 2250.0},
+     {"current": 1794.138, "min": 1500.0, "max": 2250.0},
+     {"current": 1796.317, "min": 1500.0, "max": 2250.0},
+     {"current": 1792.821, "min": 1500.0, "max": 2250.0},
+     {"current": 1794.716, "min": 1500.0, "max": 2250.0},
+     {"current": 1793.624, "min": 1500.0, "max": 2250.0},
+     {"current": 1796.346, "min": 1500.0, "max": 2250.0},
+     {"current": 1793.897, "min": 1500.0, "max": 2250.0},
+     {"current": 1735.424, "min": 1500.0, "max": 2250.0},
+     {"current": 1738.64, "min": 1500.0, "max": 2250.0},
+     {"current": 1979.998, "min": 1500.0, "max": 2250.0},
+     {"current": 1737.286, "min": 1500.0, "max": 2250.0},
+     {"current": 3313.748, "min": 1500.0, "max": 2250.0},
+     {"current": 3337.223, "min": 1500.0, "max": 2250.0},
+     {"current": 1671.416, "min": 1500.0, "max": 2250.0},
+     {"current": 1670.005, "min": 1500.0, "max": 2250.0},
+     {"current": 1794.276, "min": 1500.0, "max": 2250.0},
+     {"current": 1738.22, "min": 1500.0, "max": 2250.0},
+     {"current": 1742.737, "min": 1500.0, "max": 2250.0},
+     {"current": 1770.535, "min": 1500.0, "max": 2250.0},
+     {"current": 3320.252, "min": 1500.0, "max": 2250.0},
+     {"current": 1671.037, "min": 1500.0, "max": 2250.0},
+     {"current": 1669.549, "min": 1500.0, "max": 2250.0},
+     {"current": 1670.948, "min": 1500.0, "max": 2250.0},
+     {"current": 2843.391, "min": 1500.0, "max": 2250.0},
+     {"current": 2348.589, "min": 1500.0, "max": 2250.0},
+     {"current": 3287.915, "min": 1500.0, "max": 2250.0},
+     {"current": 2340.192, "min": 1500.0, "max": 2250.0},
+     {"current": 2426.358, "min": 1500.0, "max": 2250.0},
+     {"current": 2415.833, "min": 1500.0, "max": 2250.0},
+     {"current": 2419.416, "min": 1500.0, "max": 2250.0},
+     {"current": 2277.433, "min": 1500.0, "max": 2250.0},
+     {"current": 2365.562, "min": 1500.0, "max": 2250.0},
+     {"current": 2400.6, "min": 1500.0, "max": 2250.0},
+     {"current": 2075.143, "min": 1500.0, "max": 2250.0},
+     {"current": 2382.295, "min": 1500.0, "max": 2250.0},
+     {"current": 3066.339, "min": 1500.0, "max": 2250.0},
+     {"current": 2466.631, "min": 1500.0, "max": 2250.0},
+     {"current": 3100.81, "min": 1500.0, "max": 2250.0},
+     {"current": 2421.93, "min": 1500.0, "max": 2250.0},
+     {"current": 3233.829, "min": 1500.0, "max": 2250.0},
+     {"current": 2234.583, "min": 1500.0, "max": 2250.0},
+     {"current": 2452.089, "min": 1500.0, "max": 2250.0},
+     {"current": 2975.985, "min": 1500.0, "max": 2250.0},
+     {"current": 3301.512, "min": 1500.0, "max": 2250.0},
+     {"current": 3336.905, "min": 1500.0, "max": 2250.0},
+     {"current": 2984.87, "min": 1500.0, "max": 2250.0},
+     {"current": 2384.306, "min": 1500.0, "max": 2250.0},
+     {"current": 2965.197, "min": 1500.0, "max": 2250.0},
+     {"current": 1929.067, "min": 1500.0, "max": 2250.0},
+     {"current": 1986.731, "min": 1500.0, "max": 2250.0},
+     {"current": 1999.412, "min": 1500.0, "max": 2250.0},
+     {"current": 2477.541, "min": 1500.0, "max": 2250.0},
+     {"current": 3111.851, "min": 1500.0, "max": 2250.0},
+     {"current": 2009.907, "min": 1500.0, "max": 2250.0},
+     {"current": 1993.784, "min": 1500.0, "max": 2250.0},
+     {"current": 2144.459, "min": 1500.0, "max": 2250.0},
+     {"current": 3337.426, "min": 1500.0, "max": 2250.0},
+     {"current": 3320.114, "min": 1500.0, "max": 2250.0},
+     {"current": 2169.719, "min": 1500.0, "max": 2250.0},
+     {"current": 3308.644, "min": 1500.0, "max": 2250.0},
+     {"current": 2111.633, "min": 1500.0, "max": 2250.0},
+     {"current": 2123.71, "min": 1500.0, "max": 2250.0},
+     {"current": 2153.49, "min": 1500.0, "max": 2250.0}
+   ],
+   "disk": {"/": {"total": 1757.8785285949707, "used": 1663.5005989074707}},
+   "gpu": "NVIDIA A100-SXM4-80GB",
+   "gpu_count": 5,
+   "gpu_devices": [
+     {"name": "NVIDIA A100-SXM4-80GB", "memory_total": 85899345920},
+     {"name": "NVIDIA A100-SXM4-80GB", "memory_total": 85899345920},
+     {"name": "NVIDIA A100-SXM4-80GB", "memory_total": 85899345920},
+     {"name": "NVIDIA DGX Display", "memory_total": 4294967296},
+     {"name": "NVIDIA A100-SXM4-80GB", "memory_total": 85899345920}
+   ],
+   "memory": {"total": 503.5396919250488}
+ }
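
Note the device list above: alongside the four A100s the host exposes an "NVIDIA DGX Display" GPU, which does not support the utilization counters a GPU monitor polls; this is the likely source of the repeated "Failed to sample metric: Not Supported" errors in debug-internal.log below. A minimal sketch of the same kind of probe via NVML, assuming the pynvml package is installed (device names and indices here are whatever the host reports):

```python
import pynvml

# Enumerate visible GPUs and try to sample the utilization metric; devices
# like the DGX Display GPU raise NVML_ERROR_NOT_SUPPORTED for this query.
pynvml.nvmlInit()
for i in range(pynvml.nvmlDeviceGetCount()):
    handle = pynvml.nvmlDeviceGetHandleByIndex(i)
    name = pynvml.nvmlDeviceGetName(handle)
    try:
        util = pynvml.nvmlDeviceGetUtilizationRates(handle).gpu
        print(f"{i}: {name} utilization={util}%")
    except pynvml.NVMLError as err:
        # Surfaces as "Not Supported" on devices without these counters
        print(f"{i}: {name} metric not supported ({err})")
pynvml.nvmlShutdown()
```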
wandb/run-20240513_204652-m0g0ap7d/files/wandb-summary.json ADDED
@@ -0,0 +1 @@
+ {"_wandb": {"runtime": 219}}
wandb/run-20240513_204652-m0g0ap7d/logs/debug-internal.log ADDED
@@ -0,0 +1,455 @@
+ 2024-05-13 20:46:52,823 INFO StreamThr :1244775 [internal.py:wandb_internal():85] W&B internal server running at pid: 1244775, started at: 2024-05-13 20:46:52.823043
+ 2024-05-13 20:46:52,825 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: status
+ 2024-05-13 20:46:52,826 INFO WriterThread:1244775 [datastore.py:open_for_write():87] open: /raid/sanchit/parler-tts-mini-v0.1-expresso-concatenated-combined/wandb/run-20240513_204652-m0g0ap7d/run-m0g0ap7d.wandb
+ 2024-05-13 20:46:52,828 DEBUG SenderThread:1244775 [sender.py:send():378] send: header
+ 2024-05-13 20:46:52,829 DEBUG SenderThread:1244775 [sender.py:send():378] send: run
+ 2024-05-13 20:46:53,223 INFO SenderThread:1244775 [dir_watcher.py:__init__():211] watching files in: /raid/sanchit/parler-tts-mini-v0.1-expresso-concatenated-combined/wandb/run-20240513_204652-m0g0ap7d/files
+ 2024-05-13 20:46:53,223 INFO SenderThread:1244775 [sender.py:_start_run_threads():1123] run started: m0g0ap7d with start time 1715626012.822173
+ 2024-05-13 20:46:53,229 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: check_version
+ 2024-05-13 20:46:53,229 DEBUG SenderThread:1244775 [sender.py:send_request():405] send_request: check_version
+ 2024-05-13 20:46:53,293 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: run_start
+ 2024-05-13 20:46:53,323 DEBUG HandlerThread:1244775 [system_info.py:__init__():26] System info init
+ 2024-05-13 20:46:53,323 DEBUG HandlerThread:1244775 [system_info.py:__init__():41] System info init done
+ 2024-05-13 20:46:53,323 INFO HandlerThread:1244775 [system_monitor.py:start():194] Starting system monitor
+ 2024-05-13 20:46:53,323 INFO SystemMonitor:1244775 [system_monitor.py:_start():158] Starting system asset monitoring threads
+ 2024-05-13 20:46:53,324 INFO HandlerThread:1244775 [system_monitor.py:probe():214] Collecting system info
+ 2024-05-13 20:46:53,324 INFO SystemMonitor:1244775 [interfaces.py:start():188] Started cpu monitoring
+ 2024-05-13 20:46:53,325 INFO SystemMonitor:1244775 [interfaces.py:start():188] Started disk monitoring
+ 2024-05-13 20:46:53,325 INFO SystemMonitor:1244775 [interfaces.py:start():188] Started gpu monitoring
+ 2024-05-13 20:46:53,327 INFO SystemMonitor:1244775 [interfaces.py:start():188] Started memory monitoring
+ 2024-05-13 20:46:53,329 INFO SystemMonitor:1244775 [interfaces.py:start():188] Started network monitoring
+ 2024-05-13 20:46:53,365 DEBUG HandlerThread:1244775 [system_info.py:probe():150] Probing system
+ 2024-05-13 20:46:53,366 DEBUG HandlerThread:1244775 [system_info.py:_probe_git():135] Probing git
+ 2024-05-13 20:46:53,372 DEBUG HandlerThread:1244775 [system_info.py:_probe_git():143] Probing git done
+ 2024-05-13 20:46:53,372 DEBUG HandlerThread:1244775 [system_info.py:probe():198] Probing system done
+ 2024-05-13 20:46:53,372 DEBUG HandlerThread:1244775 [system_monitor.py:probe():223] {'os': 'Linux-5.4.0-166-generic-x86_64-with-glibc2.31', 'python': '3.11.9', ...} (full system-info dict; identical to wandb-metadata.json above)
26
+ 2024-05-13 20:46:53,372 INFO HandlerThread:1244775 [system_monitor.py:probe():224] Finished collecting system info
27
+ 2024-05-13 20:46:53,372 INFO HandlerThread:1244775 [system_monitor.py:probe():227] Publishing system info
28
+ 2024-05-13 20:46:53,372 DEBUG HandlerThread:1244775 [system_info.py:_save_conda():207] Saving list of conda packages installed into the current environment
29
+ 2024-05-13 20:46:53,387 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
30
+ 2024-05-13 20:46:53,400 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
31
+ 2024-05-13 20:46:54,224 INFO Thread-12 :1244775 [dir_watcher.py:_on_file_created():271] file/dir created: /raid/sanchit/parler-tts-mini-v0.1-expresso-concatenated-combined/wandb/run-20240513_204652-m0g0ap7d/files/conda-environment.yaml
32
+ 2024-05-13 20:46:55,418 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
33
+ 2024-05-13 20:46:55,429 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
34
+ 2024-05-13 20:46:55,741 DEBUG HandlerThread:1244775 [system_info.py:_save_conda():222] Saving conda packages done
35
+ 2024-05-13 20:46:55,742 INFO HandlerThread:1244775 [system_monitor.py:probe():229] Finished publishing system info
36
+ 2024-05-13 20:46:55,750 DEBUG SenderThread:1244775 [sender.py:send():378] send: files
37
+ 2024-05-13 20:46:55,750 INFO SenderThread:1244775 [sender.py:_save_file():1389] saving file wandb-metadata.json with policy now
38
+ 2024-05-13 20:46:55,863 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: python_packages
39
+ 2024-05-13 20:46:55,863 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: stop_status
40
+ 2024-05-13 20:46:55,863 DEBUG SenderThread:1244775 [sender.py:send_request():405] send_request: python_packages
41
+ 2024-05-13 20:46:55,865 DEBUG SenderThread:1244775 [sender.py:send_request():405] send_request: stop_status
42
+ 2024-05-13 20:46:56,093 DEBUG SenderThread:1244775 [sender.py:send():378] send: telemetry
43
+ 2024-05-13 20:46:56,093 DEBUG SenderThread:1244775 [sender.py:send():378] send: config
44
+ 2024-05-13 20:46:56,224 INFO Thread-12 :1244775 [dir_watcher.py:_on_file_modified():288] file/dir modified: /raid/sanchit/parler-tts-mini-v0.1-expresso-concatenated-combined/wandb/run-20240513_204652-m0g0ap7d/files/conda-environment.yaml
45
+ 2024-05-13 20:46:56,224 INFO Thread-12 :1244775 [dir_watcher.py:_on_file_created():271] file/dir created: /raid/sanchit/parler-tts-mini-v0.1-expresso-concatenated-combined/wandb/run-20240513_204652-m0g0ap7d/files/wandb-metadata.json
46
+ 2024-05-13 20:46:56,224 INFO Thread-12 :1244775 [dir_watcher.py:_on_file_created():271] file/dir created: /raid/sanchit/parler-tts-mini-v0.1-expresso-concatenated-combined/wandb/run-20240513_204652-m0g0ap7d/files/requirements.txt
47
+ 2024-05-13 20:46:56,224 INFO Thread-12 :1244775 [dir_watcher.py:_on_file_created():271] file/dir created: /raid/sanchit/parler-tts-mini-v0.1-expresso-concatenated-combined/wandb/run-20240513_204652-m0g0ap7d/files/output.log
48
+ 2024-05-13 20:46:56,261 INFO wandb-upload_0:1244775 [upload_job.py:push():130] Uploaded file /tmp/tmpewnm1an9wandb/l0duo91p-wandb-metadata.json
49
+ 2024-05-13 20:46:58,225 INFO Thread-12 :1244775 [dir_watcher.py:_on_file_modified():288] file/dir modified: /raid/sanchit/parler-tts-mini-v0.1-expresso-concatenated-combined/wandb/run-20240513_204652-m0g0ap7d/files/output.log
50
+ 2024-05-13 20:46:58,331 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
51
+ 2024-05-13 20:46:58,343 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
52
+ 2024-05-13 20:46:58,386 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: status_report
53
+ 2024-05-13 20:47:00,362 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
54
+ 2024-05-13 20:47:00,375 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
55
+ 2024-05-13 20:47:03,387 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: status_report
56
+ 2024-05-13 20:47:03,431 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
57
+ 2024-05-13 20:47:03,443 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
58
+ 2024-05-13 20:47:05,466 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
59
+ 2024-05-13 20:47:05,478 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
60
+ 2024-05-13 20:47:08,388 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: status_report
61
+ 2024-05-13 20:47:08,679 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
62
+ 2024-05-13 20:47:08,692 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
63
+ 2024-05-13 20:47:10,713 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
64
+ 2024-05-13 20:47:10,724 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
65
+ 2024-05-13 20:47:10,863 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: stop_status
66
+ 2024-05-13 20:47:10,863 DEBUG SenderThread:1244775 [sender.py:send_request():405] send_request: stop_status
67
+ 2024-05-13 20:47:12,746 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
68
+ 2024-05-13 20:47:12,757 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
69
+ 2024-05-13 20:47:14,094 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: status_report
70
+ 2024-05-13 20:47:15,627 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
71
+ 2024-05-13 20:47:15,637 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
72
+ 2024-05-13 20:47:17,658 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
73
+ 2024-05-13 20:47:17,668 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
74
+ 2024-05-13 20:47:19,096 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: status_report
75
+ 2024-05-13 20:47:20,779 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
76
+ 2024-05-13 20:47:20,799 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
77
+ 2024-05-13 20:47:22,817 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
78
+ 2024-05-13 20:47:22,830 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
79
+ 2024-05-13 20:47:24,099 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: status_report
80
+ 2024-05-13 20:47:24,858 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
81
+ 2024-05-13 20:47:24,870 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
82
+ 2024-05-13 20:47:25,233 INFO Thread-12 :1244775 [dir_watcher.py:_on_file_modified():288] file/dir modified: /raid/sanchit/parler-tts-mini-v0.1-expresso-concatenated-combined/wandb/run-20240513_204652-m0g0ap7d/files/config.yaml
83
+ 2024-05-13 20:47:25,863 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: stop_status
84
+ 2024-05-13 20:47:25,863 DEBUG SenderThread:1244775 [sender.py:send_request():405] send_request: stop_status
85
+ 2024-05-13 20:47:27,736 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
86
+ 2024-05-13 20:47:27,747 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
87
+ 2024-05-13 20:47:30,092 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: status_report
88
+ 2024-05-13 20:47:31,414 ERROR gpu :1244775 [interfaces.py:aggregate():159] Failed to serialize metric: division by zero
89
+ 2024-05-13 20:47:31,434 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
90
+ 2024-05-13 20:47:31,462 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
91
+ 2024-05-13 20:47:34,494 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
92
+ 2024-05-13 20:47:34,518 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
93
+ 2024-05-13 20:47:35,093 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: status_report
94
+ 2024-05-13 20:47:36,569 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
95
+ 2024-05-13 20:47:36,599 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
96
+ 2024-05-13 20:47:38,635 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
97
+ 2024-05-13 20:47:38,658 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
98
+ 2024-05-13 20:47:40,093 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: status_report
99
+ 2024-05-13 20:47:40,863 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: stop_status
100
+ 2024-05-13 20:47:40,864 DEBUG SenderThread:1244775 [sender.py:send_request():405] send_request: stop_status
101
+ 2024-05-13 20:47:41,560 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
102
+ 2024-05-13 20:47:41,584 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
103
+ 2024-05-13 20:47:43,631 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
104
+ 2024-05-13 20:47:43,652 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
105
+ 2024-05-13 20:47:46,084 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: status_report
106
+ 2024-05-13 20:47:46,570 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
107
+ 2024-05-13 20:47:46,604 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
108
+ 2024-05-13 20:47:48,647 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
109
+ 2024-05-13 20:47:48,659 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
110
+ 2024-05-13 20:47:51,084 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: status_report
111
+ 2024-05-13 20:47:51,664 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
112
+ 2024-05-13 20:47:51,686 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
113
+ 2024-05-13 20:47:53,329 DEBUG SystemMonitor:1244775 [system_monitor.py:_start():172] Starting system metrics aggregation loop
114
+ 2024-05-13 20:47:53,333 DEBUG SenderThread:1244775 [sender.py:send():378] send: stats
115
+ 2024-05-13 20:47:53,709 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
116
+ 2024-05-13 20:47:53,724 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
117
+ 2024-05-13 20:47:55,863 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: stop_status
118
+ 2024-05-13 20:47:55,864 DEBUG SenderThread:1244775 [sender.py:send_request():405] send_request: stop_status
119
+ 2024-05-13 20:47:56,593 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
120
+ 2024-05-13 20:47:56,607 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
121
+ 2024-05-13 20:47:57,080 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: status_report
122
+ 2024-05-13 20:47:58,627 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
123
+ 2024-05-13 20:47:58,641 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
124
+ 2024-05-13 20:48:01,653 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
125
+ 2024-05-13 20:48:01,664 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
126
+ 2024-05-13 20:48:02,081 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: status_report
127
+ 2024-05-13 20:48:03,684 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
128
+ 2024-05-13 20:48:03,696 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
129
+ 2024-05-13 20:48:06,665 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
130
+ 2024-05-13 20:48:06,679 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
131
+ 2024-05-13 20:48:07,082 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: status_report
132
+ 2024-05-13 20:48:10,344 ERROR gpu :1244775 [interfaces.py:aggregate():159] Failed to serialize metric: division by zero
133
+ 2024-05-13 20:48:10,366 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
134
+ 2024-05-13 20:48:10,380 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
135
+ 2024-05-13 20:48:10,863 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: stop_status
136
+ 2024-05-13 20:48:10,864 DEBUG SenderThread:1244775 [sender.py:send_request():405] send_request: stop_status
137
+ 2024-05-13 20:48:13,048 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: status_report
138
+ 2024-05-13 20:48:13,506 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
139
+ 2024-05-13 20:48:13,529 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
140
+ 2024-05-13 20:48:15,558 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
141
+ 2024-05-13 20:48:15,586 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
142
+ 2024-05-13 20:48:18,050 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: status_report
143
+ 2024-05-13 20:48:18,552 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
144
+ 2024-05-13 20:48:18,572 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
145
+ 2024-05-13 20:48:20,626 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
146
+ 2024-05-13 20:48:20,644 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
147
+ 2024-05-13 20:48:23,050 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: status_report
148
+ 2024-05-13 20:48:23,336 DEBUG SenderThread:1244775 [sender.py:send():378] send: stats
149
+ 2024-05-13 20:48:23,683 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
150
+ 2024-05-13 20:48:23,707 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
151
+ 2024-05-13 20:48:25,750 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
152
+ 2024-05-13 20:48:25,769 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
153
+ 2024-05-13 20:48:25,863 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: stop_status
154
+ 2024-05-13 20:48:25,864 DEBUG SenderThread:1244775 [sender.py:send_request():405] send_request: stop_status
155
+ 2024-05-13 20:48:28,681 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
156
+ 2024-05-13 20:48:28,701 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
157
+ 2024-05-13 20:48:29,005 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: status_report
158
+ 2024-05-13 20:48:30,725 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
159
+ 2024-05-13 20:48:30,747 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
160
+ 2024-05-13 20:48:33,782 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
161
+ 2024-05-13 20:48:33,801 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
162
+ 2024-05-13 20:48:34,006 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: status_report
163
+ 2024-05-13 20:48:35,835 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
164
+ 2024-05-13 20:48:35,858 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
165
+ 2024-05-13 20:48:38,877 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
166
+ 2024-05-13 20:48:38,904 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
167
+ 2024-05-13 20:48:39,007 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: status_report
168
+ 2024-05-13 20:48:40,863 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: stop_status
169
+ 2024-05-13 20:48:40,864 DEBUG SenderThread:1244775 [sender.py:send_request():405] send_request: stop_status
170
+ 2024-05-13 20:48:40,932 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
171
+ 2024-05-13 20:48:40,946 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
172
+ 2024-05-13 20:48:42,969 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
173
+ 2024-05-13 20:48:42,980 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
174
+ 2024-05-13 20:48:44,036 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: status_report
175
+ 2024-05-13 20:48:45,801 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
176
+ 2024-05-13 20:48:45,831 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
177
+ 2024-05-13 20:48:49,037 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: status_report
178
+ 2024-05-13 20:48:49,512 ERROR gpu :1244775 [interfaces.py:aggregate():159] Failed to serialize metric: division by zero
179
+ 2024-05-13 20:48:49,550 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
180
+ 2024-05-13 20:48:49,567 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
181
+ 2024-05-13 20:48:52,479 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
182
+ 2024-05-13 20:48:52,494 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
183
+ 2024-05-13 20:48:53,338 DEBUG SenderThread:1244775 [sender.py:send():378] send: stats
184
+ 2024-05-13 20:48:54,339 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: status_report
185
+ 2024-05-13 20:48:54,518 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
186
+ 2024-05-13 20:48:54,530 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
187
+ 2024-05-13 20:48:55,864 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: stop_status
188
+ 2024-05-13 20:48:55,864 DEBUG SenderThread:1244775 [sender.py:send_request():405] send_request: stop_status
189
+ 2024-05-13 20:48:57,576 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
190
+ 2024-05-13 20:48:57,589 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
191
+ 2024-05-13 20:48:59,615 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
192
+ 2024-05-13 20:48:59,628 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
193
+ 2024-05-13 20:49:00,007 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: status_report
194
+ 2024-05-13 20:49:02,716 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
195
+ 2024-05-13 20:49:02,730 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
196
+ 2024-05-13 20:49:04,748 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
197
+ 2024-05-13 20:49:04,763 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
198
+ 2024-05-13 20:49:05,008 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: status_report
199
+ 2024-05-13 20:49:07,719 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
200
+ 2024-05-13 20:49:07,741 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
201
+ 2024-05-13 20:49:09,766 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
202
+ 2024-05-13 20:49:09,776 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
203
+ 2024-05-13 20:49:10,009 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: status_report
204
+ 2024-05-13 20:49:10,864 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: stop_status
205
+ 2024-05-13 20:49:10,864 DEBUG SenderThread:1244775 [sender.py:send_request():405] send_request: stop_status
206
+ 2024-05-13 20:49:12,861 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
207
+ 2024-05-13 20:49:12,875 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
208
+ 2024-05-13 20:49:14,916 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
209
+ 2024-05-13 20:49:14,934 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
210
+ 2024-05-13 20:49:15,085 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: status_report
211
+ 2024-05-13 20:49:17,938 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
212
+ 2024-05-13 20:49:17,957 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
213
+ 2024-05-13 20:49:20,002 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
214
+ 2024-05-13 20:49:20,012 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
215
+ 2024-05-13 20:49:20,086 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: status_report
216
+ 2024-05-13 20:49:22,621 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
217
+ 2024-05-13 20:49:22,641 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
218
+ 2024-05-13 20:49:23,339 DEBUG SenderThread:1244775 [sender.py:send():378] send: stats
219
+ 2024-05-13 20:49:24,675 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
220
+ 2024-05-13 20:49:24,686 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
+ 2024-05-13 20:49:25,341 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: status_report
+ 2024-05-13 20:49:25,864 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: stop_status
+ 2024-05-13 20:49:25,865 DEBUG SenderThread:1244775 [sender.py:send_request():405] send_request: stop_status
+ 2024-05-13 20:49:28,370 ERROR gpu :1244775 [interfaces.py:aggregate():159] Failed to serialize metric: division by zero
+ 2024-05-13 20:49:28,612 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
+ 2024-05-13 20:49:28,633 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
+ 2024-05-13 20:49:30,656 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
+ 2024-05-13 20:49:30,678 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
+ 2024-05-13 20:49:31,038 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: status_report
+ 2024-05-13 20:49:33,121 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
+ 2024-05-13 20:49:33,150 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
+ 2024-05-13 20:49:35,177 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
+ 2024-05-13 20:49:35,217 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
+ 2024-05-13 20:49:36,039 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: status_report
+ 2024-05-13 20:49:37,932 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
+ 2024-05-13 20:49:37,967 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
+ 2024-05-13 20:49:40,002 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
+ 2024-05-13 20:49:40,032 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
+ 2024-05-13 20:49:40,864 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: stop_status
+ 2024-05-13 20:49:40,864 DEBUG SenderThread:1244775 [sender.py:send_request():405] send_request: stop_status
+ 2024-05-13 20:49:42,037 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: status_report
+ 2024-05-13 20:49:42,295 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
+ 2024-05-13 20:49:42,620 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
+ 2024-05-13 20:49:44,641 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
+ 2024-05-13 20:49:44,688 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
+ 2024-05-13 20:49:47,037 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: status_report
+ 2024-05-13 20:49:47,242 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
+ 2024-05-13 20:49:47,286 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
+ 2024-05-13 20:49:49,336 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
+ 2024-05-13 20:49:49,362 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
+ 2024-05-13 20:49:51,898 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
+ 2024-05-13 20:49:51,927 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
+ 2024-05-13 20:49:52,038 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: status_report
+ 2024-05-13 20:49:53,343 DEBUG SenderThread:1244775 [sender.py:send():378] send: stats
+ 2024-05-13 20:49:54,378 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
+ 2024-05-13 20:49:54,396 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
+ 2024-05-13 20:49:55,864 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: stop_status
+ 2024-05-13 20:49:55,864 DEBUG SenderThread:1244775 [sender.py:send_request():405] send_request: stop_status
+ 2024-05-13 20:49:56,417 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
+ 2024-05-13 20:49:56,465 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
+ 2024-05-13 20:49:58,028 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: status_report
+ 2024-05-13 20:49:58,997 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
+ 2024-05-13 20:49:59,011 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
+ 2024-05-13 20:50:01,057 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
+ 2024-05-13 20:50:01,097 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
+ 2024-05-13 20:50:03,029 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: status_report
+ 2024-05-13 20:50:05,200 ERROR gpu :1244775 [interfaces.py:aggregate():159] Failed to serialize metric: division by zero
+ 2024-05-13 20:50:05,253 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
+ 2024-05-13 20:50:05,272 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
+ 2024-05-13 20:50:07,311 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
+ 2024-05-13 20:50:07,330 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
+ 2024-05-13 20:50:08,029 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: status_report
+ 2024-05-13 20:50:09,887 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
+ 2024-05-13 20:50:09,918 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
+ 2024-05-13 20:50:10,864 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: stop_status
+ 2024-05-13 20:50:10,864 DEBUG SenderThread:1244775 [sender.py:send_request():405] send_request: stop_status
+ 2024-05-13 20:50:11,934 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
+ 2024-05-13 20:50:11,942 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
+ 2024-05-13 20:50:13,047 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: status_report
+ 2024-05-13 20:50:14,680 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
+ 2024-05-13 20:50:14,697 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
+ 2024-05-13 20:50:16,719 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
+ 2024-05-13 20:50:16,738 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
+ 2024-05-13 20:50:18,047 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: status_report
+ 2024-05-13 20:50:19,232 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
+ 2024-05-13 20:50:19,254 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
+ 2024-05-13 20:50:21,272 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
+ 2024-05-13 20:50:21,282 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
+ 2024-05-13 20:50:23,048 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: status_report
+ 2024-05-13 20:50:23,346 DEBUG SenderThread:1244775 [sender.py:send():378] send: stats
+ 2024-05-13 20:50:23,802 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
+ 2024-05-13 20:50:23,817 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
+ 2024-05-13 20:50:25,839 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
+ 2024-05-13 20:50:25,850 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
+ 2024-05-13 20:50:25,864 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: stop_status
+ 2024-05-13 20:50:25,865 DEBUG SenderThread:1244775 [sender.py:send_request():405] send_request: stop_status
+ 2024-05-13 20:50:28,061 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: status_report
+ 2024-05-13 20:50:28,256 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
+ 2024-05-13 20:50:28,265 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
+ 2024-05-13 20:50:30,284 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
+ 2024-05-13 20:50:30,296 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
+ 2024-05-13 20:50:32,293 INFO Thread-12 :1244775 [dir_watcher.py:_on_file_modified():288] file/dir modified: /raid/sanchit/parler-tts-mini-v0.1-expresso-concatenated-combined/wandb/run-20240513_204652-m0g0ap7d/files/output.log
+ 2024-05-13 20:50:32,787 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
+ 2024-05-13 20:50:32,796 ERROR gpu :1244775 [interfaces.py:monitor():142] Failed to sample metric: Not Supported
+ 2024-05-13 20:50:33,152 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: status_report
+ 2024-05-13 20:50:33,158 DEBUG SenderThread:1244775 [sender.py:send():378] send: exit
+ 2024-05-13 20:50:33,158 INFO SenderThread:1244775 [sender.py:send_exit():585] handling exit code: 1
+ 2024-05-13 20:50:33,159 INFO SenderThread:1244775 [sender.py:send_exit():587] handling runtime: 219
+ 2024-05-13 20:50:33,159 INFO SenderThread:1244775 [sender.py:_save_file():1389] saving file wandb-summary.json with policy end
+ 2024-05-13 20:50:33,159 INFO SenderThread:1244775 [sender.py:send_exit():593] send defer
+ 2024-05-13 20:50:33,159 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: defer
+ 2024-05-13 20:50:33,159 INFO HandlerThread:1244775 [handler.py:handle_request_defer():184] handle defer: 0
+ 2024-05-13 20:50:33,159 DEBUG SenderThread:1244775 [sender.py:send_request():405] send_request: defer
+ 2024-05-13 20:50:33,160 INFO SenderThread:1244775 [sender.py:send_request_defer():609] handle sender defer: 0
+ 2024-05-13 20:50:33,160 INFO SenderThread:1244775 [sender.py:transition_state():613] send defer: 1
+ 2024-05-13 20:50:33,160 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: defer
+ 2024-05-13 20:50:33,160 INFO HandlerThread:1244775 [handler.py:handle_request_defer():184] handle defer: 1
+ 2024-05-13 20:50:33,160 DEBUG SenderThread:1244775 [sender.py:send_request():405] send_request: defer
+ 2024-05-13 20:50:33,160 INFO SenderThread:1244775 [sender.py:send_request_defer():609] handle sender defer: 1
+ 2024-05-13 20:50:33,160 INFO SenderThread:1244775 [sender.py:transition_state():613] send defer: 2
+ 2024-05-13 20:50:33,160 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: defer
+ 2024-05-13 20:50:33,160 INFO HandlerThread:1244775 [handler.py:handle_request_defer():184] handle defer: 2
+ 2024-05-13 20:50:33,160 INFO HandlerThread:1244775 [system_monitor.py:finish():203] Stopping system monitor
+ 2024-05-13 20:50:33,161 DEBUG SystemMonitor:1244775 [system_monitor.py:_start():179] Finished system metrics aggregation loop
+ 2024-05-13 20:50:33,161 DEBUG SystemMonitor:1244775 [system_monitor.py:_start():183] Publishing last batch of metrics
+ 2024-05-13 20:50:33,161 INFO HandlerThread:1244775 [interfaces.py:finish():200] Joined cpu monitor
+ 2024-05-13 20:50:33,164 INFO HandlerThread:1244775 [interfaces.py:finish():200] Joined disk monitor
+ 2024-05-13 20:50:33,293 INFO Thread-12 :1244775 [dir_watcher.py:_on_file_created():271] file/dir created: /raid/sanchit/parler-tts-mini-v0.1-expresso-concatenated-combined/wandb/run-20240513_204652-m0g0ap7d/files/wandb-summary.json
+ 2024-05-13 20:50:34,293 INFO Thread-12 :1244775 [dir_watcher.py:_on_file_modified():288] file/dir modified: /raid/sanchit/parler-tts-mini-v0.1-expresso-concatenated-combined/wandb/run-20240513_204652-m0g0ap7d/files/output.log
+ 2024-05-13 20:50:34,797 ERROR gpu :1244775 [interfaces.py:aggregate():159] Failed to serialize metric: division by zero
+ 2024-05-13 20:50:34,797 INFO HandlerThread:1244775 [interfaces.py:finish():200] Joined gpu monitor
+ 2024-05-13 20:50:34,797 INFO HandlerThread:1244775 [interfaces.py:finish():200] Joined memory monitor
+ 2024-05-13 20:50:34,797 INFO HandlerThread:1244775 [interfaces.py:finish():200] Joined network monitor
+ 2024-05-13 20:50:34,798 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: poll_exit
+ 2024-05-13 20:50:34,799 DEBUG SenderThread:1244775 [sender.py:send_request():405] send_request: defer
+ 2024-05-13 20:50:34,799 INFO SenderThread:1244775 [sender.py:send_request_defer():609] handle sender defer: 2
+ 2024-05-13 20:50:34,799 INFO SenderThread:1244775 [sender.py:transition_state():613] send defer: 3
+ 2024-05-13 20:50:34,800 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: defer
+ 2024-05-13 20:50:34,800 DEBUG SenderThread:1244775 [sender.py:send():378] send: stats
+ 2024-05-13 20:50:34,800 INFO HandlerThread:1244775 [handler.py:handle_request_defer():184] handle defer: 3
+ 2024-05-13 20:50:34,800 DEBUG SenderThread:1244775 [sender.py:send_request():405] send_request: poll_exit
+ 2024-05-13 20:50:34,801 DEBUG SenderThread:1244775 [sender.py:send_request():405] send_request: defer
+ 2024-05-13 20:50:34,801 INFO SenderThread:1244775 [sender.py:send_request_defer():609] handle sender defer: 3
+ 2024-05-13 20:50:34,801 INFO SenderThread:1244775 [sender.py:transition_state():613] send defer: 4
+ 2024-05-13 20:50:34,801 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: defer
+ 2024-05-13 20:50:34,801 INFO HandlerThread:1244775 [handler.py:handle_request_defer():184] handle defer: 4
+ 2024-05-13 20:50:34,801 DEBUG SenderThread:1244775 [sender.py:send_request():405] send_request: defer
+ 2024-05-13 20:50:34,801 INFO SenderThread:1244775 [sender.py:send_request_defer():609] handle sender defer: 4
+ 2024-05-13 20:50:34,801 INFO SenderThread:1244775 [sender.py:transition_state():613] send defer: 5
+ 2024-05-13 20:50:34,802 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: defer
+ 2024-05-13 20:50:34,802 INFO HandlerThread:1244775 [handler.py:handle_request_defer():184] handle defer: 5
+ 2024-05-13 20:50:34,802 DEBUG SenderThread:1244775 [sender.py:send():378] send: summary
+ 2024-05-13 20:50:34,802 INFO SenderThread:1244775 [sender.py:_save_file():1389] saving file wandb-summary.json with policy end
+ 2024-05-13 20:50:34,802 DEBUG SenderThread:1244775 [sender.py:send_request():405] send_request: defer
+ 2024-05-13 20:50:34,802 INFO SenderThread:1244775 [sender.py:send_request_defer():609] handle sender defer: 5
+ 2024-05-13 20:50:34,802 INFO SenderThread:1244775 [sender.py:transition_state():613] send defer: 6
+ 2024-05-13 20:50:34,802 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: defer
+ 2024-05-13 20:50:34,802 INFO HandlerThread:1244775 [handler.py:handle_request_defer():184] handle defer: 6
+ 2024-05-13 20:50:34,803 DEBUG SenderThread:1244775 [sender.py:send_request():405] send_request: defer
+ 2024-05-13 20:50:34,803 INFO SenderThread:1244775 [sender.py:send_request_defer():609] handle sender defer: 6
+ 2024-05-13 20:50:34,803 INFO SenderThread:1244775 [sender.py:transition_state():613] send defer: 7
+ 2024-05-13 20:50:34,803 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: status_report
+ 2024-05-13 20:50:34,803 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: defer
+ 2024-05-13 20:50:34,803 INFO HandlerThread:1244775 [handler.py:handle_request_defer():184] handle defer: 7
+ 2024-05-13 20:50:34,803 DEBUG SenderThread:1244775 [sender.py:send_request():405] send_request: defer
+ 2024-05-13 20:50:34,803 INFO SenderThread:1244775 [sender.py:send_request_defer():609] handle sender defer: 7
+ 2024-05-13 20:50:35,159 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: poll_exit
+ 2024-05-13 20:50:35,294 INFO Thread-12 :1244775 [dir_watcher.py:_on_file_modified():288] file/dir modified: /raid/sanchit/parler-tts-mini-v0.1-expresso-concatenated-combined/wandb/run-20240513_204652-m0g0ap7d/files/wandb-summary.json
+ 2024-05-13 20:50:38,152 INFO SenderThread:1244775 [sender.py:transition_state():613] send defer: 8
+ 2024-05-13 20:50:38,152 DEBUG SenderThread:1244775 [sender.py:send_request():405] send_request: poll_exit
+ 2024-05-13 20:50:38,152 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: defer
+ 2024-05-13 20:50:38,153 INFO HandlerThread:1244775 [handler.py:handle_request_defer():184] handle defer: 8
+ 2024-05-13 20:50:38,153 DEBUG SenderThread:1244775 [sender.py:send_request():405] send_request: defer
+ 2024-05-13 20:50:38,153 INFO SenderThread:1244775 [sender.py:send_request_defer():609] handle sender defer: 8
+ 2024-05-13 20:50:38,153 INFO SenderThread:1244775 [job_builder.py:build():432] Attempting to build job artifact
+ 2024-05-13 20:50:38,153 INFO SenderThread:1244775 [job_builder.py:_get_source_type():565] is repo sourced job
+ 2024-05-13 20:50:38,160 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: poll_exit
+ 2024-05-13 20:50:38,179 INFO SenderThread:1244775 [job_builder.py:build():541] adding wandb-job metadata file
+ 2024-05-13 20:50:38,181 INFO SenderThread:1244775 [sender.py:transition_state():613] send defer: 9
+ 2024-05-13 20:50:38,181 DEBUG SenderThread:1244775 [sender.py:send_request():405] send_request: poll_exit
+ 2024-05-13 20:50:38,181 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: defer
+ 2024-05-13 20:50:38,182 INFO HandlerThread:1244775 [handler.py:handle_request_defer():184] handle defer: 9
+ 2024-05-13 20:50:38,182 DEBUG SenderThread:1244775 [sender.py:send():378] send: artifact
+ 2024-05-13 20:50:38,294 INFO Thread-12 :1244775 [dir_watcher.py:_on_file_modified():288] file/dir modified: /raid/sanchit/parler-tts-mini-v0.1-expresso-concatenated-combined/wandb/run-20240513_204652-m0g0ap7d/files/output.log
+ 2024-05-13 20:50:39,160 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: poll_exit
+ 2024-05-13 20:50:39,209 INFO wandb-upload_0:1244775 [upload_job.py:push():88] Uploaded file /home/sanchit/.local/share/wandb/artifacts/staging/tmp34vs1_ku
+ 2024-05-13 20:50:39,238 INFO wandb-upload_1:1244775 [upload_job.py:push():88] Uploaded file /tmp/tmp_ne7l6g3/wandb-job.json
+ 2024-05-13 20:50:40,085 INFO SenderThread:1244775 [sender.py:send_artifact():1467] sent artifact job-https___huggingface.co_sanchit-gandhi_parler-tts-mini-v0.1-expresso-concatenated-combined_run_parler_tts_training.py - {'id': 'QXJ0aWZhY3Q6ODM0NzI5NzMx', 'state': 'PENDING', 'artifactSequence': {'id': 'QXJ0aWZhY3RDb2xsZWN0aW9uOjE3NDIzMTI3NQ==', 'latestArtifact': None}}
+ 2024-05-13 20:50:40,086 DEBUG SenderThread:1244775 [sender.py:send_request():405] send_request: defer
+ 2024-05-13 20:50:40,086 INFO SenderThread:1244775 [sender.py:send_request_defer():609] handle sender defer: 9
+ 2024-05-13 20:50:40,086 INFO SenderThread:1244775 [dir_watcher.py:finish():358] shutting down directory watcher
+ 2024-05-13 20:50:40,295 INFO SenderThread:1244775 [dir_watcher.py:finish():388] scan: /raid/sanchit/parler-tts-mini-v0.1-expresso-concatenated-combined/wandb/run-20240513_204652-m0g0ap7d/files
+ 2024-05-13 20:50:40,295 INFO SenderThread:1244775 [dir_watcher.py:finish():402] scan save: /raid/sanchit/parler-tts-mini-v0.1-expresso-concatenated-combined/wandb/run-20240513_204652-m0g0ap7d/files/conda-environment.yaml conda-environment.yaml
+ 2024-05-13 20:50:40,295 INFO SenderThread:1244775 [dir_watcher.py:finish():402] scan save: /raid/sanchit/parler-tts-mini-v0.1-expresso-concatenated-combined/wandb/run-20240513_204652-m0g0ap7d/files/wandb-summary.json wandb-summary.json
+ 2024-05-13 20:50:40,295 INFO SenderThread:1244775 [dir_watcher.py:finish():402] scan save: /raid/sanchit/parler-tts-mini-v0.1-expresso-concatenated-combined/wandb/run-20240513_204652-m0g0ap7d/files/output.log output.log
+ 2024-05-13 20:50:40,297 INFO SenderThread:1244775 [dir_watcher.py:finish():402] scan save: /raid/sanchit/parler-tts-mini-v0.1-expresso-concatenated-combined/wandb/run-20240513_204652-m0g0ap7d/files/requirements.txt requirements.txt
+ 2024-05-13 20:50:40,300 INFO SenderThread:1244775 [dir_watcher.py:finish():402] scan save: /raid/sanchit/parler-tts-mini-v0.1-expresso-concatenated-combined/wandb/run-20240513_204652-m0g0ap7d/files/config.yaml config.yaml
+ 2024-05-13 20:50:40,301 INFO SenderThread:1244775 [dir_watcher.py:finish():402] scan save: /raid/sanchit/parler-tts-mini-v0.1-expresso-concatenated-combined/wandb/run-20240513_204652-m0g0ap7d/files/wandb-metadata.json wandb-metadata.json
+ 2024-05-13 20:50:40,301 INFO SenderThread:1244775 [sender.py:transition_state():613] send defer: 10
+ 2024-05-13 20:50:40,301 DEBUG SenderThread:1244775 [sender.py:send_request():405] send_request: poll_exit
+ 2024-05-13 20:50:40,301 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: defer
+ 2024-05-13 20:50:40,302 INFO HandlerThread:1244775 [handler.py:handle_request_defer():184] handle defer: 10
+ 2024-05-13 20:50:40,306 DEBUG SenderThread:1244775 [sender.py:send_request():405] send_request: defer
+ 2024-05-13 20:50:40,306 INFO SenderThread:1244775 [sender.py:send_request_defer():609] handle sender defer: 10
+ 2024-05-13 20:50:40,306 INFO SenderThread:1244775 [file_pusher.py:finish():169] shutting down file pusher
+ 2024-05-13 20:50:40,632 INFO wandb-upload_0:1244775 [upload_job.py:push():130] Uploaded file /raid/sanchit/parler-tts-mini-v0.1-expresso-concatenated-combined/wandb/run-20240513_204652-m0g0ap7d/files/conda-environment.yaml
+ 2024-05-13 20:50:40,639 INFO wandb-upload_1:1244775 [upload_job.py:push():130] Uploaded file /raid/sanchit/parler-tts-mini-v0.1-expresso-concatenated-combined/wandb/run-20240513_204652-m0g0ap7d/files/output.log
+ 2024-05-13 20:50:40,758 INFO wandb-upload_3:1244775 [upload_job.py:push():130] Uploaded file /raid/sanchit/parler-tts-mini-v0.1-expresso-concatenated-combined/wandb/run-20240513_204652-m0g0ap7d/files/requirements.txt
+ 2024-05-13 20:50:40,773 INFO wandb-upload_2:1244775 [upload_job.py:push():130] Uploaded file /raid/sanchit/parler-tts-mini-v0.1-expresso-concatenated-combined/wandb/run-20240513_204652-m0g0ap7d/files/wandb-summary.json
+ 2024-05-13 20:50:40,918 INFO wandb-upload_4:1244775 [upload_job.py:push():130] Uploaded file /raid/sanchit/parler-tts-mini-v0.1-expresso-concatenated-combined/wandb/run-20240513_204652-m0g0ap7d/files/config.yaml
+ 2024-05-13 20:50:41,118 INFO Thread-11 (_thread_body):1244775 [sender.py:transition_state():613] send defer: 11
+ 2024-05-13 20:50:41,119 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: defer
+ 2024-05-13 20:50:41,119 INFO HandlerThread:1244775 [handler.py:handle_request_defer():184] handle defer: 11
+ 2024-05-13 20:50:41,119 DEBUG SenderThread:1244775 [sender.py:send_request():405] send_request: defer
+ 2024-05-13 20:50:41,119 INFO SenderThread:1244775 [sender.py:send_request_defer():609] handle sender defer: 11
+ 2024-05-13 20:50:41,120 INFO SenderThread:1244775 [file_pusher.py:join():175] waiting for file pusher
+ 2024-05-13 20:50:41,120 INFO SenderThread:1244775 [sender.py:transition_state():613] send defer: 12
+ 2024-05-13 20:50:41,120 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: defer
+ 2024-05-13 20:50:41,120 INFO HandlerThread:1244775 [handler.py:handle_request_defer():184] handle defer: 12
+ 2024-05-13 20:50:41,120 DEBUG SenderThread:1244775 [sender.py:send_request():405] send_request: defer
+ 2024-05-13 20:50:41,120 INFO SenderThread:1244775 [sender.py:send_request_defer():609] handle sender defer: 12
+ 2024-05-13 20:50:41,120 INFO SenderThread:1244775 [file_stream.py:finish():601] file stream finish called
+ 2024-05-13 20:50:41,161 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: poll_exit
+ 2024-05-13 20:50:41,510 INFO SenderThread:1244775 [file_stream.py:finish():605] file stream finish is done
+ 2024-05-13 20:50:41,510 INFO SenderThread:1244775 [sender.py:transition_state():613] send defer: 13
+ 2024-05-13 20:50:41,510 DEBUG SenderThread:1244775 [sender.py:send_request():405] send_request: poll_exit
+ 2024-05-13 20:50:41,510 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: defer
+ 2024-05-13 20:50:41,510 INFO HandlerThread:1244775 [handler.py:handle_request_defer():184] handle defer: 13
+ 2024-05-13 20:50:41,510 DEBUG SenderThread:1244775 [sender.py:send_request():405] send_request: defer
+ 2024-05-13 20:50:41,510 INFO SenderThread:1244775 [sender.py:send_request_defer():609] handle sender defer: 13
+ 2024-05-13 20:50:41,510 INFO SenderThread:1244775 [sender.py:transition_state():613] send defer: 14
+ 2024-05-13 20:50:41,511 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: defer
+ 2024-05-13 20:50:41,511 INFO HandlerThread:1244775 [handler.py:handle_request_defer():184] handle defer: 14
+ 2024-05-13 20:50:41,511 DEBUG SenderThread:1244775 [sender.py:send():378] send: final
+ 2024-05-13 20:50:41,511 DEBUG SenderThread:1244775 [sender.py:send():378] send: footer
+ 2024-05-13 20:50:41,511 DEBUG SenderThread:1244775 [sender.py:send_request():405] send_request: defer
+ 2024-05-13 20:50:41,511 INFO SenderThread:1244775 [sender.py:send_request_defer():609] handle sender defer: 14
+ 2024-05-13 20:50:41,512 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: poll_exit
+ 2024-05-13 20:50:41,512 DEBUG SenderThread:1244775 [sender.py:send_request():405] send_request: poll_exit
+ 2024-05-13 20:50:41,512 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: poll_exit
+ 2024-05-13 20:50:41,512 DEBUG SenderThread:1244775 [sender.py:send_request():405] send_request: poll_exit
+ 2024-05-13 20:50:41,513 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: server_info
+ 2024-05-13 20:50:41,513 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: get_summary
+ 2024-05-13 20:50:41,513 DEBUG SenderThread:1244775 [sender.py:send_request():405] send_request: server_info
+ 2024-05-13 20:50:41,515 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: sampled_history
+ 2024-05-13 20:50:41,515 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: internal_messages
+ 2024-05-13 20:50:41,651 INFO MainThread:1244775 [wandb_run.py:_footer_history_summary_info():3994] rendering history
+ 2024-05-13 20:50:41,651 INFO MainThread:1244775 [wandb_run.py:_footer_history_summary_info():4026] rendering summary
+ 2024-05-13 20:50:41,651 INFO MainThread:1244775 [wandb_run.py:_footer_sync_info():3953] logging synced files
+ 2024-05-13 20:50:41,651 DEBUG HandlerThread:1244775 [handler.py:handle_request():158] handle_request: shutdown
+ 2024-05-13 20:50:41,651 INFO HandlerThread:1244775 [handler.py:finish():882] shutting down handler
+ 2024-05-13 20:50:42,513 INFO WriterThread:1244775 [datastore.py:close():296] close: /raid/sanchit/parler-tts-mini-v0.1-expresso-concatenated-combined/wandb/run-20240513_204652-m0g0ap7d/run-m0g0ap7d.wandb
+ 2024-05-13 20:50:42,651 INFO SenderThread:1244775 [sender.py:finish():1545] shutting down sender
+ 2024-05-13 20:50:42,651 INFO SenderThread:1244775 [file_pusher.py:finish():169] shutting down file pusher
+ 2024-05-13 20:50:42,651 INFO SenderThread:1244775 [file_pusher.py:join():175] waiting for file pusher
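The tail of debug-internal.log above traces wandb's ordered teardown: after `send: exit` (exit code 1, 219 s of runtime), the sender walks a chain of numbered defer stages (`send defer: 0` through `send defer: 14`), and each stage is acknowledged by the handler (`handle defer: N`) and confirmed back (`handle sender defer: N`) before the next begins. The sketch below is a minimal illustration of that ordered defer/acknowledge pattern only, not wandb's actual implementation; the stage roles noted in the comments are inferred from the log.

```python
# Illustrative sketch only -- not wandb's code. It mimics the ordered
# defer/acknowledge teardown visible above: stage N is sent, handled, and
# confirmed before stage N+1 starts (in the log, monitors stop at stage 2,
# the summary is written at stage 5, the job artifact is built at stage 9,
# and files are pushed at stage 10).
import logging

logging.basicConfig(format="%(asctime)s %(levelname)s %(message)s", level=logging.INFO)
log = logging.getLogger("teardown")

NUM_STAGES = 15  # defer 0 .. defer 14, as in the log


def handle_defer(stage: int) -> None:
    # Stand-in for the handler-side work performed at each stage.
    log.info("handle defer: %d", stage)


def teardown() -> None:
    for stage in range(NUM_STAGES):
        log.info("send defer: %d", stage)            # sender announces the stage
        handle_defer(stage)                          # handler performs and acks it
        log.info("handle sender defer: %d", stage)   # sender confirms, then advances


if __name__ == "__main__":
    teardown()
```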
wandb/run-20240513_204652-m0g0ap7d/logs/debug.log ADDED
@@ -0,0 +1,29 @@
+ 2024-05-13 20:46:52,817 INFO MainThread:1244577 [wandb_setup.py:_flush():76] Current SDK version is 0.17.0
+ 2024-05-13 20:46:52,817 INFO MainThread:1244577 [wandb_setup.py:_flush():76] Configure stats pid to 1244577
+ 2024-05-13 20:46:52,817 INFO MainThread:1244577 [wandb_setup.py:_flush():76] Loading settings from /home/sanchit/.config/wandb/settings
+ 2024-05-13 20:46:52,817 INFO MainThread:1244577 [wandb_setup.py:_flush():76] Loading settings from /raid/sanchit/parler-tts-mini-v0.1-expresso-concatenated-combined/wandb/settings
+ 2024-05-13 20:46:52,817 INFO MainThread:1244577 [wandb_setup.py:_flush():76] Loading settings from environment variables: {}
+ 2024-05-13 20:46:52,817 INFO MainThread:1244577 [wandb_setup.py:_flush():76] Applying setup settings: {'_disable_service': False}
+ 2024-05-13 20:46:52,817 INFO MainThread:1244577 [wandb_setup.py:_flush():76] Inferring run settings from compute environment: {'program_relpath': 'run_parler_tts_training.py', 'program_abspath': '/raid/sanchit/parler-tts-mini-v0.1-expresso-concatenated-combined/run_parler_tts_training.py', 'program': '/raid/sanchit/parler-tts-mini-v0.1-expresso-concatenated-combined/run_parler_tts_training.py'}
+ 2024-05-13 20:46:52,817 INFO MainThread:1244577 [wandb_setup.py:_flush():76] Applying login settings: {}
+ 2024-05-13 20:46:52,817 INFO MainThread:1244577 [wandb_init.py:_log_setup():520] Logging user logs to /raid/sanchit/parler-tts-mini-v0.1-expresso-concatenated-combined/wandb/run-20240513_204652-m0g0ap7d/logs/debug.log
+ 2024-05-13 20:46:52,818 INFO MainThread:1244577 [wandb_init.py:_log_setup():521] Logging internal logs to /raid/sanchit/parler-tts-mini-v0.1-expresso-concatenated-combined/wandb/run-20240513_204652-m0g0ap7d/logs/debug-internal.log
+ 2024-05-13 20:46:52,818 INFO MainThread:1244577 [wandb_init.py:init():560] calling init triggers
+ 2024-05-13 20:46:52,818 INFO MainThread:1244577 [wandb_init.py:init():567] wandb.init called with sweep_config: {}
+ config: {}
+ 2024-05-13 20:46:52,818 INFO MainThread:1244577 [wandb_init.py:init():610] starting backend
+ 2024-05-13 20:46:52,818 INFO MainThread:1244577 [wandb_init.py:init():614] setting up manager
+ 2024-05-13 20:46:52,821 INFO MainThread:1244577 [backend.py:_multiprocessing_setup():105] multiprocessing start_methods=fork,spawn,forkserver, using: spawn
+ 2024-05-13 20:46:52,822 INFO MainThread:1244577 [wandb_init.py:init():622] backend started and connected
+ 2024-05-13 20:46:52,823 INFO MainThread:1244577 [wandb_init.py:init():711] updated telemetry
+ 2024-05-13 20:46:52,826 INFO MainThread:1244577 [wandb_init.py:init():744] communicating run to backend with 90.0 second timeout
+ 2024-05-13 20:46:53,228 INFO MainThread:1244577 [wandb_run.py:_on_init():2396] communicating current version
+ 2024-05-13 20:46:53,287 INFO MainThread:1244577 [wandb_run.py:_on_init():2405] got version response
+ 2024-05-13 20:46:53,287 INFO MainThread:1244577 [wandb_init.py:init():795] starting run threads in backend
+ 2024-05-13 20:46:55,863 INFO MainThread:1244577 [wandb_run.py:_console_start():2374] atexit reg
+ 2024-05-13 20:46:55,863 INFO MainThread:1244577 [wandb_run.py:_redirect():2229] redirect: wrap_raw
+ 2024-05-13 20:46:55,863 INFO MainThread:1244577 [wandb_run.py:_redirect():2294] Wrapping output streams.
+ 2024-05-13 20:46:55,863 INFO MainThread:1244577 [wandb_run.py:_redirect():2319] Redirects installed.
+ 2024-05-13 20:46:55,864 INFO MainThread:1244577 [wandb_init.py:init():838] run started, returning control to user process
+ 2024-05-13 20:46:55,864 INFO MainThread:1244577 [wandb_run.py:_config_callback():1376] config_cb None None {'learning_rate': 8e-05, 'model_name_or_path': 'parler-tts/parler_tts_mini_v0.1', 'num_train_epochs': 8, 'gradient_accumulation_steps': 8, 'per_device_train_batch_size': 16, 'global_batch_size': 16, 'mixed_precision': 'bf16', 'lr_scheduler_type': 'SchedulerType.COSINE', 'warmup_steps': 250, 'freeze_text_encoder': True, 'max_duration_in_seconds': 30.0, 'weight_decay': 0.01, 'adam_beta1': 0.9, 'adam_beta2': 0.99, 'temperature': 1.0}
+ 2024-05-13 20:50:42,653 WARNING MsgRouterThr:1244577 [router.py:message_loop():77] message_loop has been closed
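debug.log condenses the run's client-side lifecycle: settings are loaded, the backend process is spawned, and the training script registers its hyperparameters through the config callback. Below is a minimal sketch of starting an equivalent run with the public wandb API; the project name is an assumption (the log does not show it), but the config values are exactly those reported by the `config_cb` line above.

```python
# A minimal sketch (not the training script itself) of a wandb run carrying
# the same config as the one recorded above. The project name is assumed.
import wandb

run = wandb.init(
    project="parler-tts-mini-v0.1-expresso-concatenated-combined",  # assumed
    config={
        "learning_rate": 8e-05,
        "model_name_or_path": "parler-tts/parler_tts_mini_v0.1",
        "num_train_epochs": 8,
        "gradient_accumulation_steps": 8,
        "per_device_train_batch_size": 16,
        "global_batch_size": 16,
        "mixed_precision": "bf16",
        "lr_scheduler_type": "SchedulerType.COSINE",
        "warmup_steps": 250,
        "freeze_text_encoder": True,
        "max_duration_in_seconds": 30.0,
        "weight_decay": 0.01,
        "adam_beta1": 0.9,
        "adam_beta2": 0.99,
        "temperature": 1.0,
    },
)
# ... training loop would go here ...
run.finish()
```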
wandb/run-20240513_204652-m0g0ap7d/run-m0g0ap7d.wandb ADDED
Binary file (45 kB).
 
wandb/run-20240513_205249-qaoje1x9/files/conda-environment.yaml ADDED
@@ -0,0 +1,248 @@
+ name: venv
+ channels:
+ - defaults
+ dependencies:
+ - _libgcc_mutex=0.1=main
+ - _openmp_mutex=5.1=1_gnu
+ - bzip2=1.0.8=h5eee18b_6
+ - ca-certificates=2024.3.11=h06a4308_0
+ - ld_impl_linux-64=2.38=h1181459_1
+ - libffi=3.4.4=h6a678d5_1
+ - libgcc-ng=11.2.0=h1234567_1
+ - libgomp=11.2.0=h1234567_1
+ - libstdcxx-ng=11.2.0=h1234567_1
+ - libuuid=1.41.5=h5eee18b_0
+ - ncurses=6.4=h6a678d5_0
+ - openssl=3.0.13=h7f8727e_1
+ - pip=24.0=py311h06a4308_0
+ - python=3.11.9=h955ad1f_0
+ - readline=8.2=h5eee18b_0
+ - setuptools=69.5.1=py311h06a4308_0
+ - sqlite=3.45.3=h5eee18b_0
+ - tk=8.6.14=h39e8969_0
+ - wheel=0.43.0=py311h06a4308_0
+ - xz=5.4.6=h5eee18b_1
+ - zlib=1.2.13=h5eee18b_1
+ - pip:
+   - absl-py==2.1.0
+   - accelerate==0.30.0
+   - aiohttp==3.9.5
+   - aiosignal==1.3.1
+   - aniso8601==9.0.1
+   - annotated-types==0.6.0
+   - anyio==4.3.0
+   - argbind==0.3.7
+   - argon2-cffi==23.1.0
+   - argon2-cffi-bindings==21.2.0
+   - arrow==1.3.0
+   - asttokens==2.4.1
+   - async-lru==2.0.4
+   - attrs==23.2.0
+   - audioread==3.0.1
+   - babel==2.15.0
+   - beautifulsoup4==4.12.3
+   - bidict==0.23.1
+   - bitsandbytes==0.43.1
+   - bleach==6.1.0
+   - certifi==2024.2.2
+   - cffi==1.16.0
+   - charset-normalizer==3.3.2
+   - click==8.1.7
+   - coloredlogs==14.0
+   - comm==0.2.2
+   - contourpy==1.2.1
+   - cycler==0.12.1
+   - datasets==2.19.1
+   - debugpy==1.8.1
+   - decorator==5.1.1
+   - defusedxml==0.7.1
+   - descript-audio-codec==1.0.0
+   - descript-audiotools==0.7.2
+   - dill==0.3.8
+   - dnspython==2.3.0
+   - docker-pycreds==0.4.0
+   - docstring-parser==0.16
+   - editdistance==0.8.1
+   - einops==0.8.0
+   - et-xmlfile==1.1.0
+   - evaluate==0.4.2
+   - eventlet==0.36.1
+   - executing==2.0.1
+   - fastjsonschema==2.19.1
+   - ffmpy==0.3.2
+   - filelock==3.14.0
+   - fire==0.6.0
+   - flask==2.2.5
+   - flask-cors==4.0.1
+   - flask-restful==0.3.10
+   - flask-socketio==5.3.6
+   - flask-talisman==1.1.0
+   - flatten-dict==0.4.2
+   - fonttools==4.51.0
+   - fqdn==1.5.1
+   - frozenlist==1.4.1
+   - fsspec==2024.3.1
+   - future==1.0.0
+   - g2p==2.0.0
+   - gitdb==4.0.11
+   - gitpython==3.1.43
+   - greenlet==3.0.3
+   - grpcio==1.63.0
+   - h11==0.14.0
+   - httpcore==1.0.5
+   - httpx==0.27.0
+   - huggingface-hub==0.23.0
+   - humanfriendly==10.0
+   - idna==3.7
+   - importlib-resources==6.4.0
+   - ipdb==0.13.13
+   - ipykernel==6.29.4
+   - ipython==8.24.0
+   - isoduration==20.11.0
+   - itsdangerous==2.2.0
+   - jedi==0.19.1
+   - jinja2==3.1.4
+   - jiwer==3.0.4
+   - joblib==1.4.2
+   - json5==0.9.25
+   - jsonpointer==2.4
+   - jsonschema==4.22.0
+   - jsonschema-specifications==2023.12.1
+   - julius==0.2.7
+   - jupyter-client==8.6.1
+   - jupyter-core==5.7.2
+   - jupyter-events==0.10.0
+   - jupyter-lsp==2.2.5
+   - jupyter-server==2.14.0
+   - jupyter-server-terminals==0.5.3
+   - jupyterlab==4.2.0
+   - jupyterlab-pygments==0.3.0
+   - jupyterlab-server==2.27.1
+   - kiwisolver==1.4.5
+   - lazy-loader==0.4
+   - librosa==0.10.2
+   - llvmlite==0.42.0
+   - markdown==3.6
+   - markdown-it-py==3.0.0
+   - markdown2==2.4.13
+   - markupsafe==2.1.5
+   - matplotlib==3.8.4
+   - matplotlib-inline==0.1.7
+   - mdurl==0.1.2
+   - mistune==3.0.2
+   - mpmath==1.3.0
+   - msgpack==1.0.8
+   - multidict==6.0.5
+   - multiprocess==0.70.16
+   - munkres==1.1.4
+   - nbclient==0.10.0
+   - nbconvert==7.16.4
+   - nbformat==5.10.4
+   - nest-asyncio==1.6.0
+   - networkx==3.3
+   - notebook-shim==0.2.4
+   - numba==0.59.1
+   - numpy==1.26.4
+   - nvidia-cublas-cu12==12.1.3.1
+   - nvidia-cuda-cupti-cu12==12.1.105
+   - nvidia-cuda-nvrtc-cu12==12.1.105
+   - nvidia-cuda-runtime-cu12==12.1.105
+   - nvidia-cudnn-cu12==8.9.2.26
+   - nvidia-cufft-cu12==11.0.2.54
+   - nvidia-curand-cu12==10.3.2.106
+   - nvidia-cusolver-cu12==11.4.5.107
+   - nvidia-cusparse-cu12==12.1.0.106
+   - nvidia-nccl-cu12==2.20.5
+   - nvidia-nvjitlink-cu12==12.4.127
+   - nvidia-nvtx-cu12==12.1.105
+   - openpyxl==3.1.2
+   - overrides==7.7.0
+   - packaging==24.0
+   - pandas==2.2.2
+   - pandocfilters==1.5.1
+   - panphon==0.20.0
+   - parler-tts==0.1
+   - parso==0.8.4
+   - pexpect==4.9.0
+   - pillow==10.3.0
+   - platformdirs==4.2.1
+   - pooch==1.8.1
+   - prometheus-client==0.20.0
+   - prompt-toolkit==3.0.43
+   - protobuf==3.19.6
+   - psutil==5.9.8
+   - ptyprocess==0.7.0
+   - pure-eval==0.2.2
+   - pyarrow==16.0.0
+   - pyarrow-hotfix==0.6
+   - pycparser==2.22
+   - pydantic==2.7.1
+   - pydantic-core==2.18.2
+   - pygments==2.18.0
+   - pyloudnorm==0.1.1
+   - pyparsing==3.1.2
+   - pystoi==0.4.1
+   - python-dateutil==2.9.0.post0
+   - python-engineio==4.9.0
+   - python-json-logger==2.0.7
+   - python-socketio==5.11.2
+   - pytz==2024.1
+   - pyyaml==6.0.1
+   - pyzmq==26.0.3
+   - randomname==0.2.1
+   - rapidfuzz==3.9.0
+   - referencing==0.35.1
+   - regex==2024.4.28
+   - requests==2.31.0
+   - rfc3339-validator==0.1.4
+   - rfc3986-validator==0.1.1
+   - rich==13.7.1
+   - rpds-py==0.18.1
+   - safetensors==0.4.3
+   - scikit-learn==1.4.2
+   - scipy==1.13.0
+   - send2trash==1.8.3
+   - sentencepiece==0.2.0
+   - sentry-sdk==2.1.1
+   - setproctitle==1.3.3
+   - simple-websocket==1.0.0
+   - six==1.16.0
+   - smmap==5.0.1
+   - sniffio==1.3.1
+   - soundfile==0.12.1
+   - soupsieve==2.5
+   - soxr==0.3.7
+   - stack-data==0.6.3
+   - sympy==1.12
+   - tensorboard==2.16.2
+   - tensorboard-data-server==0.7.2
+   - termcolor==2.4.0
+   - terminado==0.18.1
+   - text-unidecode==1.3
+   - threadpoolctl==3.5.0
+   - tinycss2==1.3.0
+   - tokenizers==0.19.1
+   - torch==2.3.0
+   - torch-stoi==0.2.1
+   - torchaudio==2.3.0
+   - tornado==6.4
+   - tqdm==4.66.4
+   - traitlets==5.14.3
+   - transformers==4.41.0.dev0
+   - triton==2.3.0
+   - types-python-dateutil==2.9.0.20240316
+   - typing-extensions==4.11.0
+   - tzdata==2024.1
+   - unicodecsv==0.14.1
+   - uri-template==1.3.0
+   - urllib3==2.2.1
+   - wandb==0.17.0
+   - wcwidth==0.2.13
+   - webcolors==1.13
+   - webencodings==0.5.1
+   - websocket-client==1.8.0
+   - werkzeug==3.0.3
+   - wsproto==1.2.0
+   - xxhash==3.4.1
+   - yarl==1.9.4
+ prefix: /home/sanchit/miniconda3/envs/venv
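The exported environment above can be recreated with `conda env create -f conda-environment.yaml`. The small sketch below shows one way to inspect it programmatically; it assumes only the layout visible above (conda pins are plain strings in `dependencies`, the pip pins sit in a nested `{'pip': [...]}` entry) and uses pyyaml, which is itself pinned in the environment.

```python
# A small sketch for inspecting the exported conda environment above;
# assumes the file sits in the working directory as conda-environment.yaml.
import yaml  # pyyaml==6.0.1 is pinned in the environment itself

with open("conda-environment.yaml") as f:
    env = yaml.safe_load(f)

# Conda dependencies are plain strings; the pip block is a dict {"pip": [...]}.
conda_pins = [d for d in env["dependencies"] if isinstance(d, str)]
pip_pins = next(d["pip"] for d in env["dependencies"] if isinstance(d, dict))

print(env["name"], "-", len(conda_pins), "conda packages,", len(pip_pins), "pip packages")
# The pins most relevant to this run:
print([p for p in pip_pins if p.startswith(("torch", "transformers", "parler-tts", "wandb"))])
```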
wandb/run-20240513_205249-qaoje1x9/files/config.yaml ADDED
@@ -0,0 +1,88 @@
+ wandb_version: 1
+
+ _wandb:
+   desc: null
+   value:
+     python_version: 3.11.9
+     cli_version: 0.17.0
+     framework: huggingface
+     huggingface_version: 4.41.0.dev0
+     is_jupyter_run: false
+     is_kaggle_kernel: false
+     start_time: 1715626369
+     t:
+       1:
+       - 1
+       - 5
+       - 11
+       - 49
+       - 51
+       - 53
+       - 55
+       - 71
+       - 100
+       2:
+       - 1
+       - 5
+       - 11
+       - 49
+       - 51
+       - 53
+       - 55
+       - 71
+       - 100
+       3:
+       - 2
+       - 23
+       - 61
+       4: 3.11.9
+       5: 0.17.0
+       6: 4.41.0.dev0
+       8:
+       - 5
+       13: linux-x86_64
+ learning_rate:
+   desc: null
+   value: 8.0e-05
+ model_name_or_path:
+   desc: null
+   value: parler-tts/parler_tts_mini_v0.1
+ num_train_epochs:
+   desc: null
+   value: 8
+ gradient_accumulation_steps:
+   desc: null
+   value: 8
+ per_device_train_batch_size:
+   desc: null
+   value: 16
+ global_batch_size:
+   desc: null
+   value: 16
+ mixed_precision:
+   desc: null
+   value: bf16
+ lr_scheduler_type:
+   desc: null
+   value: SchedulerType.COSINE
+ warmup_steps:
+   desc: null
+   value: 250
+ freeze_text_encoder:
+   desc: null
+   value: true
+ max_duration_in_seconds:
+   desc: null
+   value: 30.0
+ weight_decay:
+   desc: null
+   value: 0.01
+ adam_beta1:
+   desc: null
+   value: 0.9
+ adam_beta2:
+   desc: null
+   value: 0.99
+ temperature:
+   desc: null
+   value: 1.0
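This config.yaml is the serialized wandb run config: each hyperparameter is wrapped in a `{desc, value}` pair, alongside the reserved `wandb_version` and `_wandb` telemetry block. A minimal sketch for flattening it back into a plain dict, assuming only the structure shown above:

```python
# Flatten a wandb config.yaml (as saved above) into a plain hyperparameter
# dict, skipping the reserved wandb_version and _wandb entries.
import yaml

with open("config.yaml") as f:
    raw = yaml.safe_load(f)

hparams = {
    key: entry["value"]
    for key, entry in raw.items()
    if key not in ("wandb_version", "_wandb")
}

print(hparams["learning_rate"], hparams["lr_scheduler_type"])
# -> 8e-05 SchedulerType.COSINE
```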
wandb/run-20240513_205249-qaoje1x9/files/media/audio/Speech samples/eval_1016_193b029d494fd24e7cfa.wav ADDED
Binary file (962 kB).
 
wandb/run-20240513_205249-qaoje1x9/files/media/audio/Speech samples/eval_1016_345bfb6a72849809d361.wav ADDED
Binary file (962 kB).
 
wandb/run-20240513_205249-qaoje1x9/files/media/audio/Speech samples/eval_1016_3c9adbd9374e0fb5ce3d.wav ADDED
Binary file (962 kB).
 
wandb/run-20240513_205249-qaoje1x9/files/media/audio/Speech samples/eval_1016_3cd94e4824cc6c8fb09c.wav ADDED
Binary file (962 kB).
 
wandb/run-20240513_205249-qaoje1x9/files/media/audio/Speech samples/eval_1016_3ed6544e58dd861a5d9e.wav ADDED
Binary file (962 kB).
 
wandb/run-20240513_205249-qaoje1x9/files/media/audio/Speech samples/eval_1016_9da1fed11be9d614d9ec.wav ADDED
Binary file (962 kB).
 
wandb/run-20240513_205249-qaoje1x9/files/media/audio/Speech samples/eval_1016_ec838b0233dbe87d33f3.wav ADDED
Binary file (962 kB).
 
wandb/run-20240513_205249-qaoje1x9/files/media/audio/Speech samples/eval_127_0db946e177a69cbe11f5.wav ADDED
Binary file (842 kB).
 
wandb/run-20240513_205249-qaoje1x9/files/media/audio/Speech samples/eval_127_32c9af8d48e757598000.wav ADDED
Binary file (842 kB).
 
wandb/run-20240513_205249-qaoje1x9/files/media/audio/Speech samples/eval_127_341c52fd92336c009f67.wav ADDED
Binary file (842 kB).
 
wandb/run-20240513_205249-qaoje1x9/files/media/audio/Speech samples/eval_127_43ed5d3749c912acb591.wav ADDED
Binary file (842 kB).
 
wandb/run-20240513_205249-qaoje1x9/files/media/audio/Speech samples/eval_127_75818e76e9e077f058be.wav ADDED
Binary file (842 kB).
 
wandb/run-20240513_205249-qaoje1x9/files/media/audio/Speech samples/eval_127_d24330f3382b9e6ea7ea.wav ADDED
Binary file (842 kB).
 
wandb/run-20240513_205249-qaoje1x9/files/media/audio/Speech samples/eval_127_ec7dcb5421538131ede7.wav ADDED
Binary file (842 kB).
 
wandb/run-20240513_205249-qaoje1x9/files/media/audio/Speech samples/eval_254_2794dcaf322bd12e2814.wav ADDED
Binary file (810 kB).
 
wandb/run-20240513_205249-qaoje1x9/files/media/audio/Speech samples/eval_254_2ef5b33e2eaf98dca4a6.wav ADDED
Binary file (810 kB).
 
wandb/run-20240513_205249-qaoje1x9/files/media/audio/Speech samples/eval_254_4ca836a112634417b82e.wav ADDED
Binary file (810 kB).
 
wandb/run-20240513_205249-qaoje1x9/files/media/audio/Speech samples/eval_254_837a3499e3f93538b643.wav ADDED
Binary file (810 kB).
 
wandb/run-20240513_205249-qaoje1x9/files/media/audio/Speech samples/eval_254_b3650df61e399b05257d.wav ADDED
Binary file (810 kB).
 
wandb/run-20240513_205249-qaoje1x9/files/media/audio/Speech samples/eval_254_d33ccbefe990db0dce2b.wav ADDED
Binary file (810 kB).
 
wandb/run-20240513_205249-qaoje1x9/files/media/audio/Speech samples/eval_254_e8ca5038019cad3cde86.wav ADDED
Binary file (810 kB).
 
wandb/run-20240513_205249-qaoje1x9/files/media/audio/Speech samples/eval_381_4899e0da4615e883ad13.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4899e0da4615e883ad137c4ebecee7d753f2512e69b2610ee121046fafa228a2
+ size 1282092
wandb/run-20240513_205249-qaoje1x9/files/media/audio/Speech samples/eval_381_492597073098578f0605.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:492597073098578f0605b080944b218a8c03fd49715aa37e5b6b9f2bbdc975fe
+ size 1282092
wandb/run-20240513_205249-qaoje1x9/files/media/audio/Speech samples/eval_381_67d9409a306e3614ec3f.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:67d9409a306e3614ec3f84ea766046f8674e3cd4a8b13d1231dd6a356164f6d3
+ size 1282092
wandb/run-20240513_205249-qaoje1x9/files/media/audio/Speech samples/eval_381_7c47ba927ac118ffaacc.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7c47ba927ac118ffaacca70183d37077fd7a6d6fc2c649e51b7fb5137609cd8e
+ size 1282092
wandb/run-20240513_205249-qaoje1x9/files/media/audio/Speech samples/eval_381_89ad32d31f3e70178cc1.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:89ad32d31f3e70178cc19b1fb1c8ceda0377b104575def497602c02ba8bd1196
+ size 1282092
wandb/run-20240513_205249-qaoje1x9/files/media/audio/Speech samples/eval_381_cd644667186ae0518a3c.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cd644667186ae0518a3c08763c4aabf1e1f4d5754f3025b1e40779e8f068d489
+ size 1282092
wandb/run-20240513_205249-qaoje1x9/files/media/audio/Speech samples/eval_381_f7405ef7b645b3265477.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f7405ef7b645b3265477b3e4e65bf1313dbc4445631e516872a054a694bb5e4a
+ size 1282092
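The three-line entries above are Git LFS pointer files rather than audio: each records the pointer-spec version, a sha256 object id, and the byte size of the .wav it stands in for (the audio itself is fetched with `git lfs pull`). A minimal parser for that pointer format, assuming only the key-value layout shown above:

```python
# Parse a Git LFS pointer file of the form seen above:
#   version https://git-lfs.github.com/spec/v1
#   oid sha256:<hex digest>
#   size <bytes>
def parse_lfs_pointer(path: str) -> dict:
    fields = {}
    with open(path) as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            fields[key] = value
    assert fields["version"].startswith("https://git-lfs.github.com/spec/")
    algo, _, digest = fields["oid"].partition(":")
    return {"algo": algo, "digest": digest, "size": int(fields["size"])}

# Example against the first pointer above (path relative to its directory):
info = parse_lfs_pointer("eval_381_4899e0da4615e883ad13.wav")
print(info["size"])  # 1282092 bytes once the real audio is pulled
```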
wandb/run-20240513_205249-qaoje1x9/files/media/audio/Speech samples/eval_508_00e9064c0bdbd6b9428d.wav ADDED
Binary file (996 kB).
 
wandb/run-20240513_205249-qaoje1x9/files/media/audio/Speech samples/eval_508_11adac906cb7e2ef30c6.wav ADDED
Binary file (996 kB).
 
wandb/run-20240513_205249-qaoje1x9/files/media/audio/Speech samples/eval_508_5619d97860f92fc1a62d.wav ADDED
Binary file (996 kB).
 
wandb/run-20240513_205249-qaoje1x9/files/media/audio/Speech samples/eval_508_bcb03b95f0470920bdc6.wav ADDED
Binary file (996 kB).
 
wandb/run-20240513_205249-qaoje1x9/files/media/audio/Speech samples/eval_508_e48b4ff2b12d5ffdb11c.wav ADDED
Binary file (996 kB).