Dionyssos committed on
Commit
318370a
1 Parent(s): 01d6809

720 sentences - plots shadow

Browse files
Files changed (1) hide show
  1. visualize_tts_plesantness.py +48 -38
visualize_tts_plesantness.py CHANGED
@@ -10,12 +10,14 @@
10
  # mimic3_770.wav
11
  # mimic3_speedup_770.wav
12
  FULL_WAV = [
13
- 'english_z.wav',
14
- 'english_4x_z.wav',
15
- 'human_z.wav',
16
- 'foreign_z.wav',
17
- 'foreign_4x_z.wav',
18
  ]
 
 
19
  import pandas as pd
20
  import os
21
 
@@ -231,8 +233,8 @@ for long_audio in FULL_WAV:
231
  process_func=process_function,
232
  # process_func_args={'outputs': 'logits_scene'},
233
  process_func_applies_sliding_window=False,
234
- win_dur=4.0,
235
- hop_dur=40.0,
236
  sampling_rate=16000,
237
  resample=True,
238
  verbose=True,
@@ -284,36 +286,40 @@ for lang in ['english',
284
  'foreign']:
285
 
286
 
287
- fig, ax = plt.subplots(nrows=8, ncols=2, figsize=(21, 24),
288
  gridspec_kw={'hspace': 0, 'wspace': .04})
289
 
290
 
291
 
292
 
293
- time_stamp = preds['human_z.wav'].index.to_numpy()
294
  for j, dim in enumerate(['arousal',
295
  'dominance',
296
  'valence']):
297
 
298
  # MIMIC3
299
 
300
- ax[j, 0].plot(time_stamp, preds[f'{lang}_z.wav'][dim],
301
  color=(0,104/255,139/255),
302
  label='mean_1',
303
  linewidth=2)
304
  ax[j, 0].fill_between(time_stamp,
305
 
306
- preds[f'{lang}_z.wav'][dim],
307
- preds['human_z.wav'][dim],
308
 
309
  color=(.2,.2,.2),
310
  alpha=0.244)
311
- if j == 0:
312
- ax[j, 0].legend([f'StyleTTS2 using {lang}',
313
- f'StyleTTS2 uising LibriSpeech'],
314
- prop={'size': 10},
 
 
 
 
315
  )
316
- ax[j, 0].set_ylabel(dim.lower(), color=(.4, .4, .4), fontsize=14)
317
 
318
  # TICK
319
  ax[j, 0].set_ylim([1e-7, .9999])
@@ -326,26 +332,30 @@ for lang in ['english',
326
  # MIMIC3 4x speed
327
 
328
 
329
- ax[j, 1].plot(time_stamp, preds[f'{lang}_4x_z.wav'][dim],
330
  color=(0,104/255,139/255),
331
  label='mean_1',
332
  linewidth=2)
333
  ax[j, 1].fill_between(time_stamp,
334
 
335
- preds[f'{lang}_4x_z.wav'][dim],
336
- preds['human_z.wav'][dim],
337
 
338
- color=(.2,.2,.2),
339
  alpha=0.244)
340
- if j == 0:
341
- ax[j, 1].legend([f'StyleTTS2 using {lang} 4x speed',
342
- f'StyleTTS2 using LibriSpeech'],
343
- prop={'size': 10},
 
 
 
 
344
  # loc='lower right'
345
  )
346
 
347
 
348
- ax[j, 1].set_xlabel('767 Harvard Sentences (seconds)')
349
 
350
 
351
 
@@ -366,7 +376,7 @@ for lang in ['english',
366
 
367
 
368
 
369
- time_stamp = preds['human_z.wav'].index.to_numpy()
370
  for j, dim in enumerate(['Angry',
371
  'Sad',
372
  'Happy',
@@ -380,14 +390,14 @@ for lang in ['english',
380
 
381
  # MIMIC3
382
 
383
- ax[j, 0].plot(time_stamp, preds[f'{lang}_z.wav'][dim],
384
  color=(0,104/255,139/255),
385
  label='mean_1',
386
  linewidth=2)
387
  ax[j, 0].fill_between(time_stamp,
388
 
389
- preds[f'{lang}_z.wav'][dim],
390
- preds['human_z.wav'][dim],
391
 
392
  color=(.2,.2,.2),
393
  alpha=0.244)
@@ -398,26 +408,26 @@ for lang in ['english',
398
  # )
399
 
400
 
401
- ax[j, 0].set_ylabel(dim.lower(), color=(.4, .4, .4), fontsize=14)
402
 
403
  # TICKS
404
  ax[j, 0].set_ylim([1e-7, .9999])
405
  ax[j, 0].set_xlim([time_stamp[0], time_stamp[-1]])
406
  ax[j, 0].set_xticklabels(['' for _ in ax[j, 0].get_xticklabels()])
407
- ax[j, 0].set_xlabel('767 Harvard Sentences (seconds)', fontsize=16, color=(.4,.4,.4))
408
 
409
 
410
  # MIMIC3 4x speed
411
 
412
 
413
- ax[j, 1].plot(time_stamp, preds[f'{lang}_4x_z.wav'][dim],
414
  color=(0,104/255,139/255),
415
  label='mean_1',
416
  linewidth=2)
417
  ax[j, 1].fill_between(time_stamp,
418
 
419
- preds[f'{lang}_4x_z.wav'][dim],
420
- preds['human_z.wav'][dim],
421
 
422
  color=(.2,.2,.2),
423
  alpha=0.244)
@@ -426,8 +436,8 @@ for lang in ['english',
426
  # prop={'size': 10},
427
  # # loc='upper left'
428
  # )
429
- ax[j, 1].set_xlabel('767 Harvard Sentences (seconds)', fontsize=16, color=(.4,.4,.4))
430
- ax[j, 1].set_ylim([1e-7, .999])
431
  # ax[j, 1].set_yticklabels(['' for _ in ax[j, 1].get_yticklabels()])
432
  ax[j, 1].set_xticklabels(['' for _ in ax[j, 1].get_xticklabels()])
433
  ax[j, 1].set_xlim([time_stamp[0], time_stamp[-1]])
@@ -442,6 +452,6 @@ for lang in ['english',
442
 
443
 
444
 
445
- plt.savefig(f'fig_{lang}_z.pdf', bbox_inches='tight')
446
  plt.close()
447
 
 
10
  # mimic3_770.wav
11
  # mimic3_speedup_770.wav
12
  FULL_WAV = [
13
+ 'english_hfullh.wav',
14
+ 'english_4x_hfullh.wav',
15
+ 'human_hfullh.wav',
16
+ 'foreign_hfullh.wav',
17
+ 'foreign_4x_hfullh.wav',
18
  ]
19
+ WIN = 40
20
+ HOP = 10
21
  import pandas as pd
22
  import os
23
 
 
233
  process_func=process_function,
234
  # process_func_args={'outputs': 'logits_scene'},
235
  process_func_applies_sliding_window=False,
236
+ win_dur=WIN,
237
+ hop_dur=HOP,
238
  sampling_rate=16000,
239
  resample=True,
240
  verbose=True,
 
286
  'foreign']:
287
 
288
 
289
+ fig, ax = plt.subplots(nrows=8, ncols=2, figsize=(24,20.7),
290
  gridspec_kw={'hspace': 0, 'wspace': .04})
291
 
292
 
293
 
294
 
295
+ time_stamp = preds['human_hfullh.wav'].index.to_numpy()
296
  for j, dim in enumerate(['arousal',
297
  'dominance',
298
  'valence']):
299
 
300
  # MIMIC3
301
 
302
+ ax[j, 0].plot(time_stamp, preds[f'{lang}_hfullh.wav'][dim],
303
  color=(0,104/255,139/255),
304
  label='mean_1',
305
  linewidth=2)
306
  ax[j, 0].fill_between(time_stamp,
307
 
308
+ 0*preds[f'{lang}_hfullh.wav'][dim],
309
+ preds['human_hfullh.wav'][dim],
310
 
311
  color=(.2,.2,.2),
312
  alpha=0.244)
313
+ if j == 0:
314
+ if lang == 'english':
315
+ desc = 'English'
316
+ else:
317
+ desc = 'Non-English'
318
+ ax[j, 0].legend([f'StyleTTS2 using Mimic-3 {desc}',
319
+ f'StyleTTS2 uising EmoDB'],
320
+ prop={'size': 14},
321
  )
322
+ ax[j, 0].set_ylabel(dim.lower(), color=(.4, .4, .4), fontsize=17)
323
 
324
  # TICK
325
  ax[j, 0].set_ylim([1e-7, .9999])
 
332
  # MIMIC3 4x speed
333
 
334
 
335
+ ax[j, 1].plot(time_stamp, preds[f'{lang}_4x_hfullh.wav'][dim],
336
  color=(0,104/255,139/255),
337
  label='mean_1',
338
  linewidth=2)
339
  ax[j, 1].fill_between(time_stamp,
340
 
341
+ 0 * preds[f'{lang}_4x_hfullh.wav'][dim],
342
+ preds['human_hfullh.wav'][dim],
343
 
344
+ color=(.2,.2,.2),
345
  alpha=0.244)
346
+ if j == 0:
347
+ if lang == 'english':
348
+ desc = 'English'
349
+ else:
350
+ desc = 'Non-English'
351
+ ax[j, 1].legend([f'StyleTTS2 using Mimic-3 {desc} 4x speed',
352
+ f'StyleTTS2 using EmoDB'],
353
+ prop={'size': 14},
354
  # loc='lower right'
355
  )
356
 
357
 
358
+ ax[j, 1].set_xlabel('720 Harvard Sentences')
359
 
360
 
361
 
 
376
 
377
 
378
 
379
+ time_stamp = preds['human_hfullh.wav'].index.to_numpy()
380
  for j, dim in enumerate(['Angry',
381
  'Sad',
382
  'Happy',
 
390
 
391
  # MIMIC3
392
 
393
+ ax[j, 0].plot(time_stamp, preds[f'{lang}_hfullh.wav'][dim],
394
  color=(0,104/255,139/255),
395
  label='mean_1',
396
  linewidth=2)
397
  ax[j, 0].fill_between(time_stamp,
398
 
399
+ 0*preds[f'{lang}_hfullh.wav'][dim],
400
+ preds['human_hfullh.wav'][dim],
401
 
402
  color=(.2,.2,.2),
403
  alpha=0.244)
 
408
  # )
409
 
410
 
411
+ ax[j, 0].set_ylabel(dim.lower(), color=(.4, .4, .4), fontsize=17)
412
 
413
  # TICKS
414
  ax[j, 0].set_ylim([1e-7, .9999])
415
  ax[j, 0].set_xlim([time_stamp[0], time_stamp[-1]])
416
  ax[j, 0].set_xticklabels(['' for _ in ax[j, 0].get_xticklabels()])
417
+ ax[j, 0].set_xlabel('720 Harvard Sentences', fontsize=17, color=(.2,.2,.2))
418
 
419
 
420
  # MIMIC3 4x speed
421
 
422
 
423
+ ax[j, 1].plot(time_stamp, preds[f'{lang}_4x_hfullh.wav'][dim],
424
  color=(0,104/255,139/255),
425
  label='mean_1',
426
  linewidth=2)
427
  ax[j, 1].fill_between(time_stamp,
428
 
429
+ 0*preds[f'{lang}_4x_hfullh.wav'][dim],
430
+ preds['human_hfullh.wav'][dim],
431
 
432
  color=(.2,.2,.2),
433
  alpha=0.244)
 
436
  # prop={'size': 10},
437
  # # loc='upper left'
438
  # )
439
+ ax[j, 1].set_xlabel('720 Harvard Sentences', fontsize=17, color=(.2,.2,.2))
440
+ ax[j, 1].set_ylim([1e-7, .9999])
441
  # ax[j, 1].set_yticklabels(['' for _ in ax[j, 1].get_yticklabels()])
442
  ax[j, 1].set_xticklabels(['' for _ in ax[j, 1].get_xticklabels()])
443
  ax[j, 1].set_xlim([time_stamp[0], time_stamp[-1]])
 
452
 
453
 
454
 
455
+ plt.savefig(f'fig_{lang}_{WIN=}_{HOP=}_fin0.pdf', bbox_inches='tight')
456
  plt.close()
457