visuals draft

mimic3_make_harvard_sentences.py  (+170 -124)
@@ -21,7 +21,7 @@ from mimic3_tts.__main__ import (CommandLineInterfaceState,
     shutdown_tts,
     OutputNaming,
     process_line)
-import msinference
+# import msinference
 import time
 import json
 import pandas as pd
@@ -79,9 +79,9 @@ LABELS = [
 ]
 
 
-
-
-
+config = transformers.Wav2Vec2Config()  #finetuning_task='spef2feat_reg')
+config.dev = torch.device('cuda:0')
+config.dev2 = torch.device('cuda:0')
 def _softmax(x):
     '''x : (batch, num_class)'''
     x -= x.max(1, keepdims=True)  # if all -400 then sum(exp(x)) = 0
@@ -114,7 +114,7 @@ def _infer(self, x):
 teacher_cat = AutoModelForAudioClassification.from_pretrained(
     '3loi/SER-Odyssey-Baseline-WavLM-Categorical-Attributes',
     trust_remote_code=True  # fun definitions see 3loi/SER-.. repo
-    ).to(
+    ).to(config.dev2).eval()
 teacher_cat.forward = types.MethodType(_infer, teacher_cat)
 
 
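`teacher_cat.forward = types.MethodType(_infer, teacher_cat)` binds the module-level `_infer` to the instance as its `forward`. A minimal illustration of that binding pattern (toy class, not the SER model):

    import types

    class Toy:
        pass

    def double(self, x):        # 'self' becomes the bound instance, as with teacher_cat
        return x * 2

    t = Toy()
    t.forward = types.MethodType(double, t)
    print(t.forward(3))         # -> 6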
@@ -136,7 +136,7 @@ def process_function(x, sampling_rate, idx):
     # {0: 'Angry', 1: 'Sad', 2: 'Happy', 3: 'Surprise',
     #  4: 'Fear', 5: 'Disgust', 6: 'Contempt', 7: 'Neutral'}
     # tensor([[0.0015, 0.3651, 0.0593, 0.0315, 0.0600, 0.0125, 0.0319, 0.4382]])
-    logits_cat = teacher_cat(torch.from_numpy(x).to(
+    logits_cat = teacher_cat(torch.from_numpy(x).to(config.dev)).cpu().detach().numpy()
     # USE ALL CATEGORIES
     # --
     # logits_audioset = audioset_model(x, 16000)['logits_sounds']
@@ -162,7 +162,7 @@ interface = audinterface.Feature(
     resample=True,
     verbose=True,
 )
-#
+# ================================== ====== END INTERFACE
 
 
 
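Only the tail of the `audinterface.Feature(...)` call is visible in this hunk; a minimal sketch of how such an interface is typically assembled around `process_function` (the argument values here are assumptions, not taken from the file):

    import audinterface

    interface = audinterface.Feature(
        feature_names=LABELS,            # assumed: one output column per dimension/category
        process_func=process_function,   # defined above in the file
        sampling_rate=16000,             # assumed: the SER models above expect 16 kHz audio
        resample=True,
        verbose=True,
    )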
@@ -272,120 +272,6 @@ Path(foreign_dir).mkdir(parents=True, exist_ok=True)
 
 
 
-# # synth 767
-# for _id, _voice in enumerate(foreign_voices):
-#     _str = _voice.replace('/', '_').replace('#', '_').replace('_low', '')
-#     if 'cmu-arctic' in _str:
-#         _str = _str.replace('cmu-arctic', 'cmu_arctic')  #+ '.wav'
-
-#     print('\n\n\n\nExecuting', _voice, '\n\n\n\n\n')
-
-#     if (
-#         not os.path.isfile(foreign_dir + 'mimic3__' + _str + '.wav') or
-#         not os.path.isfile(foreign_dir + 'styletts2__' + _str + '.wav')
-#        ):
-
-#         # Mimic3 GitHub Quota exceded:
-#         # https://github.com/MycroftAI/mimic3-voices
-#         # Above repo can exceed download quota of LFS
-#         # Copy mimic-voices from local copies
-#         # clone https://huggingface.co/mukowaty/mimic3-voices/tree/main/voices
-#         # copy to ~/
-#         #
-#         #
-#         home_voice_dir = f'/home/audeering.local/dkounadis/.local/share/mycroft/mimic3/voices/{_voice.split("#")[0]}/'
-#         Path(home_voice_dir).mkdir(parents=True, exist_ok=True)
-#         speaker_free_voice_name = _voice.split("#")[0] if '#' in _voice else _voice
-
-#         if (
-#             (not os.path.isfile(home_voice_dir + 'generator.onnx')) or
-#             (os.path.getsize(home_voice_dir + 'generator.onnx') < 500)  # .onnx - is just LFS header
-#            ):
-
-#             # Copy
-#             shutil.copyfile(
-#                 f'/data/dkounadis/mimic3-voices/voices/{speaker_free_voice_name}/generator.onnx',
-#                 home_voice_dir + 'generator.onnx')
-
-#         # pre made
-#         prompt_path = 'mimic3_foreign_4x/' + _str + '.wav'
-
-
-#     # =========================================================================== HARVRAD wav
-#     with open('harvard.json', 'r') as f:
-#         harvard_individual_sentences = json.load(f)['sentences']
-#     total_audio_mimic3 = []
-#     total_audio_stts2 = []
-#     ix = 0
-#     for list_of_10 in harvard_individual_sentences[:1]:  # 77
-#         text = ' '.join(list_of_10['sentences'])
-#         # harvard.append(long_sentence.replace('.', ' '))
-#         # for text in list_of_10['sentences']:
-#         style_vec = msinference.compute_style(prompt_path)
-#         print(ix, text)
-#         ix += 1
-
-#         x = msinference.inference(text,
-#                                   style_vec,
-#                                   alpha=0.3,
-#                                   beta=0.7,
-#                                   diffusion_steps=7,
-#                                   embedding_scale=1)
-
-#         total_audio_stts2.append(x)
-
-#         # also synthesize mimic with the same sentence and voice
-
-#         # MIMIC-3 = = = = = = = = = = = = = = BEGIN
-
-#         rate = 1  # high speed sounds nice if used as speaker-reference audio for StyleTTS2
-#         _ssml = (
-#             '<speak>'
-#             '<prosody volume=\'64\'>'
-#             f'<prosody rate=\'{rate}\'>'
-#             f'<voice name=\'{_voice}\'>'
-#             '<s>'
-#             f'{text}'
-#             '</s>'
-#             '</voice>'
-#             '</prosody>'
-#             '</prosody>'
-#             '</speak>'
-#         )
-#         with open('_tmp_ssml.txt', 'w') as f:
-#             f.write(_ssml)
-
-#         # ps = subprocess.Popen(f'cat _tmp_ssml.txt | mimic3 --ssml > {reference_wav}', shell=True)
-#         # ps.wait()  # using ps to call mimic3 because samples dont have time to be written in stdout buffer
-#         args = get_args()
-#         args.ssml = True
-#         args.text = [_ssml]  #['aa', 'bb'] #txt
-#         args.interactive = False
-#         # args.output_naming = OutputNaming.TIME
-
-#         state = CommandLineInterfaceState(args=args)
-#         initialize_args(state)
-#         initialize_tts(state)
-#         # args.texts = [txt] #['aa', 'bb'] #txt
-#         # state.stdout = '.' #None #'makeme.wav'
 # # state.output_dir = '.noopy'
 # # state.interactive = False
 # # state.output_naming = OutputNaming.TIME
@@ -609,10 +495,170 @@ for folder, list_voices in [
 
 # AUD I N T E R F A C E
 # file_interface = f'timeseries_{long_audio.replace("/", "")}.pkl'
+
+
+
 for engine in ['mimic3', 'styletts2']:
     harvard_of_voice = f'{_dir}{engine}__{_str}'
     if not os.path.exists(harvard_of_voice + '.pkl'):
-
-
+        df = interface.process_file(harvard_of_voice + '.wav')
+        df.to_pickle(harvard_of_voice + '.pkl')
     else:
-
+        # df = pd.read_pickle(harvard_of_voice + '.pkl')
+        print(harvard_of_voice + '.pkl', 'FOUND')
+
+
+# Here we have pkls
+
+
+# ===============================================================================
+# V I S U A L S
+#
+# ===============================================================================
+
+for folder, list_voices in [
+        ['foreign', foreign_voices],
+        ['english', english_voices],
+        ]:
+    print(folder, list_voices[:4], '\n\nVISUALIZING VOICES')
+    for _id, _voice in enumerate(list_voices[:4]):
+        _str = _voice.replace('/', '_').replace('#', '_').replace('_low', '')
+        _dir = folder + '_pkl/'
+        if 'cmu-arctic' in _str:
+            _str = _str.replace('cmu-arctic', 'cmu_arctic')  #+ '.wav'
+
+        vis_df = {}
+        # LOAD PKL
+        for engine in ['mimic3', 'styletts2']:
+            harvard_of_voice = f'{_dir}{engine}__{_str}'
+            if not os.path.exists(harvard_of_voice + '.pkl'):
+                df = interface.process_file(harvard_of_voice + '.wav')
+                df.to_pickle(harvard_of_voice + '.pkl')
+            else:
+                df = pd.read_pickle(harvard_of_voice + '.pkl')
+                print(harvard_of_voice + '.pkl', 'FOUND')
+
+            vis_df[engine] = df
+        SHORT = min(len(vis_df['mimic3']), len(vis_df['styletts2']))
+        for k, v in vis_df.items():
+            p = v[:SHORT]  # Truncate extra segments - human is slower than mimic3
+
+            p.reset_index(inplace=True)
+            p.drop(columns=['file', 'start'], inplace=True)
+            p.set_index('end', inplace=True)
+            # p = p.filter(scene_classes)  #['transport', 'indoor', 'outdoor'])
+            p.index = p.index.map(mapper=(lambda x: x.total_seconds()))
+            vis_df[k] = p
+
+        print(vis_df, '\n\n\n\n \n')
+        # ============ VISUAL ADV cats of styletts2 vs mimic3 same-voice
+
+        fig, ax = plt.subplots(nrows=10, ncols=2, figsize=(24, 24),
+                               gridspec_kw={'hspace': 0, 'wspace': .04})
+
+        # ADV
+
+        time_stamp = vis_df['mimic3'].index.to_numpy()
+        for j, dim in enumerate(['arousal',
+                                 'dominance',
+                                 'valence']):
+
+            # MIMIC3
+            ax[j, 0].plot(time_stamp, vis_df['mimic3'][dim],
+                          color=(0, 104/255, 139/255),
+                          label='mean_1',
+                          linewidth=2)
+            ax[j, 0].fill_between(time_stamp,
+                                  vis_df['mimic3'][dim],
+                                  vis_df['styletts2'][dim],
+                                  color=(.2, .2, .2),
+                                  alpha=0.244)
+            if j == 0:
+                ax[j, 0].legend(['StyleTTS2 style mimic3',
+                                 'StyleTTS2 style crema-d'],
+                                prop={'size': 10},
+                                # loc='lower right'
+                                )
+            ax[j, 0].set_ylabel(dim.lower(), color=(.4, .4, .4), fontsize=14)
+
+            # TICK
+            ax[j, 0].set_ylim([1e-7, .9999])
+            # ax[j, 0].set_yticks([.25, .5, .75])
+            # ax[j, 0].set_yticklabels(['0.25', '.5', '0.75'])
+            ax[j, 0].set_xticklabels(['' for _ in ax[j, 0].get_xticklabels()])
+            ax[j, 0].set_xlim([time_stamp[0], time_stamp[-1]])
+
+            ax[j, 0].grid()
+
+        # CATEGORIES
+
+        time_stamp = vis_df['styletts2'].index.to_numpy()
+        for j, dim in enumerate(['Angry',
+                                 'Sad',
+                                 'Happy',
+                                 'Surprise',
+                                 'Fear',
+                                 'Disgust',
+                                 'Contempt',
+                                 # 'Neutral'
+                                 ]):  # ASaHSuFDCN
+            j = j + 3  # skip A/D/V subplots
+
+            # MIMIC3
+            ax[j, 0].plot(time_stamp, vis_df['mimic3'][dim],
+                          color=(0, 104/255, 139/255),
+                          label='mean_1',
+                          linewidth=2)
+            ax[j, 0].fill_between(time_stamp,
+                                  vis_df['mimic3'][dim],
+                                  vis_df['styletts2'][dim],
+                                  color=(.2, .2, .2),
+                                  alpha=0.244)
+            # ax[j, 0].legend(['StyleTTS2 style mimic3',
+            #                  'StyleTTS2 style crema-d'],
+            #                 prop={'size': 10},
+            #                 # loc='upper left'
+            #                 )
+
+            ax[j, 0].set_ylabel(dim.lower(), color=(.4, .4, .4), fontsize=14)
+
+            # TICKS
+            ax[j, 0].set_ylim([1e-7, .9999])
+            ax[j, 0].set_xlim([time_stamp[0], time_stamp[-1]])
+            ax[j, 0].set_xticklabels(['' for _ in ax[j, 0].get_xticklabels()])
+            ax[j, 0].set_xlabel('767 Harvard Sentences (seconds)', fontsize=16, color=(.4, .4, .4))
+
+            ax[j, 0].grid()
+
+        plt.savefig(f'bh_{_str}.png', bbox_inches='tight')
+        plt.close()
+
+print('UNCOMMENT msinference')
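The load-or-compute pattern and the index post-processing in this hunk could be folded into one helper; a sketch under the assumption that `interface.process_file` returns a DataFrame with a (file, start, end) Timedelta MultiIndex, which is what the code above drops and converts:

    import os
    import pandas as pd

    def load_timeseries(wav_path, pkl_path):
        '''Compute the audinterface time-series once, then reuse the pickle.'''
        if not os.path.exists(pkl_path):
            df = interface.process_file(wav_path)
            df.to_pickle(pkl_path)
        else:
            df = pd.read_pickle(pkl_path)
        # (file, start, end) MultiIndex -> plain float seconds on 'end'
        p = df.reset_index().drop(columns=['file', 'start']).set_index('end')
        p.index = p.index.map(lambda x: x.total_seconds())
        return p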