Dionyssos committed on
Commit
33b0763
1 Parent(s): bb8414f

visuals draft

Files changed (1)
  1. mimic3_make_harvard_sentences.py +170 -124
mimic3_make_harvard_sentences.py CHANGED
@@ -21,7 +21,7 @@ from mimic3_tts.__main__ import (CommandLineInterfaceState,
                                  shutdown_tts,
                                  OutputNaming,
                                  process_line)
-import msinference
+# import msinference
 import time
 import json
 import pandas as pd
@@ -79,9 +79,9 @@ LABELS = [
 ]
 
 
-args = transformers.Wav2Vec2Config()  #finetuning_task='spef2feat_reg')
-args.dev = torch.device('cuda:0')
-args.dev2 = torch.device('cuda:0')
+config = transformers.Wav2Vec2Config()  #finetuning_task='spef2feat_reg')
+config.dev = torch.device('cuda:0')
+config.dev2 = torch.device('cuda:0')
 def _softmax(x):
     '''x : (batch, num_class)'''
     x -= x.max(1, keepdims=True)  # if all -400 then sum(exp(x)) = 0
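The `_softmax` comment refers to the standard max-subtraction trick: shifting each row by its maximum keeps `exp` from underflowing, so a row of all `-400` logits (which in float32 makes `sum(exp(x))` collapse to 0, hence 0/0) still normalizes. A minimal self-contained sketch of the idea, with illustrative names:

```python
import numpy as np

def stable_softmax(x):
    '''x : (batch, num_class) -- shift by the row max before exp'''
    z = x - x.max(axis=1, keepdims=True)   # largest entry per row becomes 0
    e = np.exp(z)
    return e / e.sum(axis=1, keepdims=True)

x = np.full((1, 8), -400.0, dtype=np.float32)
print(np.exp(x).sum())     # 0.0 -- exp underflows in float32, naive softmax is 0/0
print(stable_softmax(x))   # uniform 0.125 per class, as expected
```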
@@ -114,7 +114,7 @@ def _infer(self, x):
 teacher_cat = AutoModelForAudioClassification.from_pretrained(
     '3loi/SER-Odyssey-Baseline-WavLM-Categorical-Attributes',
     trust_remote_code=True  # fun definitions see 3loi/SER-.. repo
-    ).to(args.dev2).eval()
+    ).to(config.dev2).eval()
 teacher_cat.forward = types.MethodType(_infer, teacher_cat)
 
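`types.MethodType` binds the module-level `_infer` to the loaded model instance, so `teacher_cat.forward(x)` dispatches to `_infer` with `self` set to `teacher_cat`. A toy sketch of that binding mechanism (the class and function here are invented for illustration):

```python
import types

class Classifier:
    def forward(self, x):
        return 'default head'

def _infer(self, x):
    # replacement forward: 'self' is the bound instance, so its
    # attributes (weights, config, ...) remain reachable here
    return f'custom inference on {x!r}'

model = Classifier()
model.forward = types.MethodType(_infer, model)  # rebind on this instance only
print(model.forward('wav'))                      # -> custom inference on 'wav'
```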
 
@@ -136,7 +136,7 @@ def process_function(x, sampling_rate, idx):
     #{0: 'Angry', 1: 'Sad', 2: 'Happy', 3: 'Surprise',
     # 4: 'Fear', 5: 'Disgust', 6: 'Contempt', 7: 'Neutral'}
     #tensor([[0.0015, 0.3651, 0.0593, 0.0315, 0.0600, 0.0125, 0.0319, 0.4382]])
-    logits_cat = teacher_cat(torch.from_numpy(x).to(args.dev)).cpu().detach().numpy()
+    logits_cat = teacher_cat(torch.from_numpy(x).to(config.dev)).cpu().detach().numpy()
     # USE ALL CATEGORIES
     # --
     # logits_audioset = audioset_model(x, 16000)['logits_sounds']
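The renamed line keeps the usual numpy-to-GPU-to-numpy round trip: `torch.from_numpy` wraps the array without copying, `.to(config.dev)` moves it to the device, and `.cpu().detach().numpy()` brings the logits back for pandas/audinterface. A self-contained sketch with a stand-in model (the real `teacher_cat` is the WavLM classifier loaded above):

```python
import numpy as np
import torch

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model = torch.nn.Linear(16000, 8).to(device).eval()  # stand-in for teacher_cat

x = np.random.randn(1, 16000).astype(np.float32)     # one second at 16 kHz
with torch.no_grad():                                # no graph, so .detach() is moot
    logits = model(torch.from_numpy(x).to(device))
print(logits.cpu().numpy().shape)                    # (1, 8) category logits
```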
@@ -162,7 +162,7 @@ interface = audinterface.Feature(
     resample=True,
     verbose=True,
 )
-# ======================================== END INTERFACE
+# ================================== ====== END INTERFACE
 
 
@@ -272,120 +272,6 @@ Path(foreign_dir).mkdir(parents=True, exist_ok=True)
 
 
 
-# # synth 767
-# for _id, _voice in enumerate(foreign_voices):
-#     _str = _voice.replace('/', '_').replace('#', '_').replace('_low', '')
-#     if 'cmu-arctic' in _str:
-#         _str = _str.replace('cmu-arctic', 'cmu_arctic')  #+ '.wav'
-
-#     print('\n\n\n\nExecuting', _voice, '\n\n\n\n\n')
-
-#     if (
-#         not os.path.isfile(foreign_dir + 'mimic3__' + _str + '.wav') or
-#         not os.path.isfile(foreign_dir + 'styletts2__' + _str + '.wav')
-#     ):
-
-#         # Mimic3 GitHub quota exceeded:
-#         # https://github.com/MycroftAI/mimic3-voices
-#         # Above repo can exceed the download quota of LFS.
-#         # Copy mimic3 voices from local copies:
-#         # clone https://huggingface.co/mukowaty/mimic3-voices/tree/main/voices
-#         # and copy to ~/
-#         home_voice_dir = f'/home/audeering.local/dkounadis/.local/share/mycroft/mimic3/voices/{_voice.split("#")[0]}/'
-#         Path(home_voice_dir).mkdir(parents=True, exist_ok=True)
-#         speaker_free_voice_name = _voice.split("#")[0] if '#' in _voice else _voice
-
-#         if (
-#             (not os.path.isfile(home_voice_dir + 'generator.onnx')) or
-#             (os.path.getsize(home_voice_dir + 'generator.onnx') < 500)  # .onnx is just the LFS header
-#         ):
-
-#             # Copy
-#             shutil.copyfile(
-#                 f'/data/dkounadis/mimic3-voices/voices/{speaker_free_voice_name}/generator.onnx',
-#                 home_voice_dir + 'generator.onnx')
-
-#         # pre-made
-#         prompt_path = 'mimic3_foreign_4x/' + _str + '.wav'
-
-#         # =========================================================================== HARVARD wav
-#         with open('harvard.json', 'r') as f:
-#             harvard_individual_sentences = json.load(f)['sentences']
-#         total_audio_mimic3 = []
-#         total_audio_stts2 = []
-#         ix = 0
-#         for list_of_10 in harvard_individual_sentences[:1]:  # 77
-#             text = ' '.join(list_of_10['sentences'])
-#             # harvard.append(long_sentence.replace('.', ' '))
-#             # for text in list_of_10['sentences']:
-#             style_vec = msinference.compute_style(prompt_path)
-#             print(ix, text)
-#             ix += 1
-
-#             x = msinference.inference(text,
-#                                       style_vec,
-#                                       alpha=0.3,
-#                                       beta=0.7,
-#                                       diffusion_steps=7,
-#                                       embedding_scale=1)
-
-#             total_audio_stts2.append(x)
-
-#             # also synthesize mimic3 with the same sentence and voice
-
-#             # MIMIC-3 = = = = = = = = = = = = = = BEGIN
-
-#             rate = 1  # high speed sounds nice if used as speaker-reference audio for StyleTTS2
-#             _ssml = (
-#                 '<speak>'
-#                 '<prosody volume=\'64\'>'
-#                 f'<prosody rate=\'{rate}\'>'
-#                 f'<voice name=\'{_voice}\'>'
-#                 '<s>'
-#                 f'{text}'
-#                 '</s>'
-#                 '</voice>'
-#                 '</prosody>'
-#                 '</prosody>'
-#                 '</speak>'
-#             )
-#             with open('_tmp_ssml.txt', 'w') as f:
-#                 f.write(_ssml)
-
-#             # ps = subprocess.Popen(f'cat _tmp_ssml.txt | mimic3 --ssml > {reference_wav}', shell=True)
-#             # ps.wait()  # using ps to call mimic3 because samples don't have time to be written to the stdout buffer
-#             args = get_args()
-#             args.ssml = True
-#             args.text = [_ssml]  # ['aa', 'bb']  # txt
-#             args.interactive = False
-#             # args.output_naming = OutputNaming.TIME
-
-#             state = CommandLineInterfaceState(args=args)
-#             initialize_args(state)
-#             initialize_tts(state)
-#             # args.texts = [txt]  # ['aa', 'bb']  # txt
-#             # state.stdout = '.'  # None  # 'makeme.wav'
 # # state.output_dir = '.noopy'
 # # state.interactive = False
 # # state.output_naming = OutputNaming.TIME
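One detail worth noting from the deleted block: the `os.path.getsize(...) < 500` guard. A Git-LFS pointer is a tiny text stub, so a `generator.onnx` under a few hundred bytes means only the LFS header was cloned, not the model. Factored out as a sketch (the helper name is hypothetical):

```python
import os

def looks_like_lfs_pointer(path, threshold=500):
    '''True if path exists but is far too small to be a real .onnx model,
    i.e. it is probably the Git-LFS pointer text left by a failed pull.'''
    return os.path.isfile(path) and os.path.getsize(path) < threshold

# usage mirroring the deleted block: fall back to a local copy of the voice
# if looks_like_lfs_pointer(home_voice_dir + 'generator.onnx'):
#     shutil.copyfile(local_backup_onnx, home_voice_dir + 'generator.onnx')
```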
@@ -609,10 +495,170 @@ for folder, list_voices in [
 
         # AUD I N T E R F A C E
         # file_interface = f'timeseries_{long_audio.replace("/", "")}.pkl'
+
+
+
         for engine in ['mimic3', 'styletts2']:
             harvard_of_voice = f'{_dir}{engine}__{_str}'
             if not os.path.exists(harvard_of_voice + '.pkl'):
-                df_pred = interface.process_file(harvard_of_voice + '.wav')
-                df_pred.to_pickle(harvard_of_voice + '.pkl')
+                df = interface.process_file(harvard_of_voice + '.wav')
+                df.to_pickle(harvard_of_voice + '.pkl')
             else:
-                print(harvard_of_voice + '.pkl', 'FOUND')
+                # df = pd.read_pickle(harvard_of_voice + '.pkl')
+                print(harvard_of_voice + '.pkl', 'FOUND')
+
+
+
+
+
+# Here we have pkls
+
+
+# ===============================================================================
+#                            V I S U A L S
+#
+# ===============================================================================
+
+for folder, list_voices in [
+        ['foreign', foreign_voices],
+        ['english', english_voices],
+        ]:
+    print(folder, list_voices[:4], '\n\nVISUALIZING VOICES')
+    for _id, _voice in enumerate(list_voices[:4]):
+        _str = _voice.replace('/', '_').replace('#', '_').replace('_low', '')
+        _dir = folder + '_pkl/'
+        if 'cmu-arctic' in _str:
+            _str = _str.replace('cmu-arctic', 'cmu_arctic')  #+ '.wav'
+
+        vis_df = {}
+        # LOAD PKL
+        for engine in ['mimic3', 'styletts2']:
+            harvard_of_voice = f'{_dir}{engine}__{_str}'
+            if not os.path.exists(harvard_of_voice + '.pkl'):
+                df = interface.process_file(harvard_of_voice + '.wav')
+                df.to_pickle(harvard_of_voice + '.pkl')
+            else:
+                df = pd.read_pickle(harvard_of_voice + '.pkl')
+                print(harvard_of_voice + '.pkl', 'FOUND')
+
+            vis_df[engine] = df
+        SHORT = min(len(vis_df['mimic3']), len(vis_df['styletts2']))
+        for k, v in vis_df.items():
+            p = v[:SHORT]  # Truncate extra segments - human is slower than mimic3
+
+            p.reset_index(inplace=True)
+            p.drop(columns=['file', 'start'], inplace=True)
+            p.set_index('end', inplace=True)
+            # p = p.filter(scene_classes)  # ['transport', 'indoor', 'outdoor'])
+            p.index = p.index.map(mapper=(lambda x: x.total_seconds()))
+            vis_df[k] = p
+
+        print(vis_df, '\n\n\n\n \n')
+        # ============ VISUAL ADV cats of styletts2 vs mimic3 same-voice
+
+        fig, ax = plt.subplots(nrows=10, ncols=2, figsize=(24, 24),
+                               gridspec_kw={'hspace': 0, 'wspace': .04})
+
+        # ADV
+
+        time_stamp = vis_df['mimic3'].index.to_numpy()
+        for j, dim in enumerate(['arousal',
+                                 'dominance',
+                                 'valence']):
+
+            # MIMIC3
+            ax[j, 0].plot(time_stamp, vis_df['mimic3'][dim],
+                          color=(0, 104/255, 139/255),
+                          label='mean_1',
+                          linewidth=2)
+            ax[j, 0].fill_between(time_stamp,
+                                  vis_df['mimic3'][dim],
+                                  vis_df['styletts2'][dim],
+                                  color=(.2, .2, .2),
+                                  alpha=0.244)
+            if j == 0:
+                ax[j, 0].legend(['StyleTTS2 style mimic3',
+                                 'StyleTTS2 style crema-d'],
+                                prop={'size': 10},
+                                # loc='lower right'
+                                )
+            ax[j, 0].set_ylabel(dim.lower(), color=(.4, .4, .4), fontsize=14)
+
+            # TICK
+            ax[j, 0].set_ylim([1e-7, .9999])
+            # ax[j, 0].set_yticks([.25, .5, .75])
+            # ax[j, 0].set_yticklabels(['0.25', '.5', '0.75'])
+            ax[j, 0].set_xticklabels(['' for _ in ax[j, 0].get_xticklabels()])
+            ax[j, 0].set_xlim([time_stamp[0], time_stamp[-1]])
+
+            ax[j, 0].grid()
+
+        # CATEGORIES
+
+        time_stamp = vis_df['styletts2'].index.to_numpy()
+        for j, dim in enumerate(['Angry',
+                                 'Sad',
+                                 'Happy',
+                                 'Surprise',
+                                 'Fear',
+                                 'Disgust',
+                                 'Contempt',
+                                 # 'Neutral'
+                                 ]):  # ASaHSuFDCN
+            j = j + 3  # skip the A/D/V subplots
+
+            # MIMIC3
+            ax[j, 0].plot(time_stamp, vis_df['mimic3'][dim],
+                          color=(0, 104/255, 139/255),
+                          label='mean_1',
+                          linewidth=2)
+            ax[j, 0].fill_between(time_stamp,
+                                  vis_df['mimic3'][dim],
+                                  vis_df['styletts2'][dim],
+                                  color=(.2, .2, .2),
+                                  alpha=0.244)
+            # ax[j, 0].legend(['StyleTTS2 style mimic3',
+            #                  'StyleTTS2 style crema-d'],
+            #                 prop={'size': 10},
+            #                 # loc='upper left'
+            #                 )
+
+            ax[j, 0].set_ylabel(dim.lower(), color=(.4, .4, .4), fontsize=14)
+
+            # TICKS
+            ax[j, 0].set_ylim([1e-7, .9999])
+            ax[j, 0].set_xlim([time_stamp[0], time_stamp[-1]])
+            ax[j, 0].set_xticklabels(['' for _ in ax[j, 0].get_xticklabels()])
+            ax[j, 0].set_xlabel('767 Harvard Sentences (seconds)', fontsize=16, color=(.4, .4, .4))
+
+            ax[j, 0].grid()
+
+        plt.savefig(f'bh_{_str}.png', bbox_inches='tight')
+        plt.close()
+
+print('UNCOMMENT msinference')
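The pkl branch added above is a compute-or-load cache: the slow `audinterface` extraction runs once per wav, the resulting DataFrame is pickled, and later passes just unpickle it. Factored out, the pattern looks roughly like this (the helper name is hypothetical):

```python
import os
import pandas as pd

def cached_features(interface, stem):
    '''Return features for stem + '.wav', computing and pickling them on
    the first call and reading stem + '.pkl' on every later one.'''
    pkl = stem + '.pkl'
    if os.path.exists(pkl):
        return pd.read_pickle(pkl)
    df = interface.process_file(stem + '.wav')  # slow: runs the SER models
    df.to_pickle(pkl)
    return df
```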
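The plotting loops draw mimic3 as a line and shade the gap to styletts2 with `fill_between`, which is what makes per-dimension disagreement visible at a glance. A minimal synthetic-data sketch of that band plot (colors and alpha taken from the diff, data invented):

```python
import numpy as np
import matplotlib.pyplot as plt

t = np.linspace(0, 10, 400)        # stand-in time axis (seconds)
a = .5 + .3 * np.sin(t)            # stand-in trace, e.g. mimic3 arousal
b = .5 + .3 * np.sin(t + .4)       # stand-in trace, e.g. styletts2 arousal

fig, ax = plt.subplots()
ax.plot(t, a, color=(0, 104/255, 139/255), linewidth=2)
ax.fill_between(t, a, b, color=(.2, .2, .2), alpha=0.244)  # shaded disagreement
ax.set_ylabel('arousal', color=(.4, .4, .4), fontsize=14)
ax.set_xlim([t[0], t[-1]])
ax.grid()
fig.savefig('band_sketch.png', bbox_inches='tight')
```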