Dionyssos committed on
Commit
bb8414f
1 Parent(s): fda2aa0

mimic3 vs styletts2 - both wav & pkl

Browse files
Files changed (1) hide show
  1. mimic3_make_harvard_sentences.py +618 -0
mimic3_make_harvard_sentences.py ADDED
@@ -0,0 +1,618 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 1. Synthesize Harvard Sentences via Mimic-3 - 1 voice
2
+ # 1. Synthesize via StyleTTS2 --> use same or sweetdreams
3
+ # 2. Run audinterface on this 767
4
+ # 3. .mimic3_pkl .styletts2_pkl -> different durations
5
+
6
+ # The script may crash with an onnx "protobuf incomplete file" error, apparently because shutil.copyfile() is not truly blocking
7
+ # If that happens, rerun the script - it will copy all voices from hf:mimic3-voices to ~/.local/mimic3
8
+ import shutil
9
+ import csv
10
+ import io
11
+ import os
12
+ import typing
13
+ import wave
14
+ import sys
15
+ from mimic3_tts.__main__ import (CommandLineInterfaceState,
16
+ get_args,
17
+ initialize_args,
18
+ initialize_tts,
19
+ # print_voices,
20
+ # process_lines,
21
+ shutdown_tts,
22
+ OutputNaming,
23
+ process_line)
24
+ import msinference
25
+ import time
26
+ import json
27
+ import pandas as pd
28
+ import os
29
+ import numpy as np
30
+ import audonnx
31
+ import audb
32
+ from pathlib import Path
33
+ import transformers
34
+ import torch
35
+ import audmodel
36
+ import audinterface
37
+ import matplotlib.pyplot as plt
38
+ import audiofile
39
+
40
+
41
+ # ================================================ LIST OF VOICES
42
ROOT_DIR = '/data/dkounadis/mimic3-voices/'
foreign_voices = []
english_voices = []
# Walk voices/<lang>/<voice> and collect one id per speaker:
#   '<lang>/<voice>#<speaker>' when the voice ships a speakers.txt,
#   '<lang>/<voice>'           when it does not.
for lang in os.listdir(ROOT_DIR + 'voices'):
    # language folders whose name contains 'en_' go to the English list
    bucket = english_voices if 'en_' in lang else foreign_voices
    for voice in os.listdir(ROOT_DIR + 'voices/' + lang):
        voice_id = lang + '/' + voice
        try:
            with open(ROOT_DIR + 'voices/' + voice_id + '/speakers.txt', 'r') as f:
                bucket.extend(voice_id + '#' + spk.rstrip() for spk in f)
        except FileNotFoundError:
            # no speakers.txt: the voice is listed without a speaker suffix
            bucket.append(voice_id)
67
+ # ================================================== INTERFACE MODELS
68
# Names of the 11 features returned by process_function, in output order:
# 3 dimensional attributes followed by 8 emotion categories.
# ('speech_synthesizer' and 'synthetic_singing' are currently not extracted.)
LABELS = (
    ['arousal', 'dominance', 'valence']
    + ['Angry', 'Sad', 'Happy', 'Surprise', 'Fear', 'Disgust', 'Contempt', 'Neutral']
)
80
+
81
+
82
# A transformers config object reused as a plain container for the torch devices.
# process_function reads args.dev at call time, so this module-level name
# has to stay bound to this object.
args = transformers.Wav2Vec2Config()
_cuda0 = torch.device('cuda:0')
args.dev = _cuda0
args.dev2 = _cuda0
85
def _softmax(x):
    '''Row-wise softmax of a 2-D array of logits.

    x : (batch, num_class) array of logits; it is left unmodified.

    Returns an array of the same shape whose rows each sum to 1.
    '''
    # Shift every row so its largest logit is 0. Without the shift a row of
    # very negative logits (e.g. all -400) gives sum(exp(x)) = 0.
    # A new array is created here, so the caller's input is not mutated.
    shifted = x - x.max(1, keepdims=True)
    # Floor the shifted logits at -100 before exponentiating.
    exps = np.exp(np.maximum(-100, shifted))
    return exps / exps.sum(1, keepdims=True)
92
+
93
+
94
+ from transformers import AutoModelForAudioClassification
95
+ import types
96
+
97
+
98
def _infer(self, x):
    '''Forward pass: normalise, encode, attentive-statistics pool, classify.

    x: (batch, audio-samples-16KHz)

    Returns whatever self.ser_model produces for the pooled features.
    '''
    # The mean is added (not subtracted) here, exactly as in the original code.
    # NOTE(review): confirm this sign matches the normalisation the model expects.
    normalised = (x + self.config.mean) / self.config.std
    hidden = self.ssl_model(normalised, attention_mask=None).last_hidden_state

    # Attention weights over dim 1 of the hidden states
    scores = self.pool_model.sap_linear(hidden).tanh()
    weights = torch.matmul(scores, self.pool_model.attention).softmax(1)

    # Weighted mean and weighted standard deviation, concatenated on dim 1
    mean = (hidden * weights).sum(1)
    second_moment = (hidden * hidden * weights).sum(1)
    # clamp keeps the variance positive before the square root
    std = (second_moment - mean * mean).clamp(min=1e-7).sqrt()
    pooled = torch.cat([mean, std], 1)
    return self.ser_model(pooled)
113
+
114
# Categorical-emotion teacher. Its class definitions live in the hub repo
# (3loi/SER-..), hence trust_remote_code. Its forward is rebound to _infer.
teacher_cat = AutoModelForAudioClassification.from_pretrained(
    '3loi/SER-Odyssey-Baseline-WavLM-Categorical-Attributes',
    trust_remote_code=True,
)
teacher_cat = teacher_cat.to(args.dev2).eval()
teacher_cat.forward = types.MethodType(_infer, teacher_cat)

# Audioset & ADV
# The audioset model is currently disabled:
# audioset_model = audonnx.load(audmodel.load('17c240ec-1.0.0'), device='cuda:0')
adv_model = audonnx.load(audmodel.load('90398682-2.0.0'), device='cuda:0')
126
+
127
def process_function(x, sampling_rate, idx):
    '''Predict ADV logits and emotion-category probabilities for one signal.

    Used as process_func of the audinterface.Feature defined in this file.

    x : numpy array holding the audio; it is passed to torch.from_numpy and
        to adv_model as-is (assumes a (1, samples) layout - TODO confirm).
    sampling_rate : unused; both models are called assuming 16 kHz audio
        (the interface in this file resamples to 16000).
    idx : unused.

    Returns an array concatenated along axis 1: the 'logits' output of
    adv_model followed by the softmax of the categorical logits, whose
    classes are {0: 'Angry', 1: 'Sad', 2: 'Happy', 3: 'Surprise',
    4: 'Fear', 5: 'Disgust', 6: 'Contempt', 7: 'Neutral'}.
    '''
    # Inference only: no_grad avoids building an autograd graph that would be
    # thrown away immediately.
    with torch.no_grad():
        logits_cat = teacher_cat(torch.from_numpy(x).to(args.dev)).cpu().detach().numpy()

    # The audioset outputs (speech synthesizer / synthetic singing) are disabled:
    # logits_audioset = audioset_model(x, 16000)['logits_sounds']
    # logits_audioset = logits_audioset[:, [7, 35]]
    logits_adv = adv_model(x, 16000)['logits']

    cat = np.concatenate([logits_adv, _softmax(logits_cat)], 1)
    print(cat)
    return cat
153
+
154
# Feature extractor that calls process_function on 4 s windows with a 1 s hop.
# Audio is resampled to 16 kHz, and the interface itself does the windowing
# (process_func_applies_sliding_window=False).
interface = audinterface.Feature(
    feature_names=LABELS,
    process_func=process_function,
    process_func_applies_sliding_window=False,
    sampling_rate=16000,
    resample=True,
    win_dur=4.0,
    hop_dur=1.0,
    verbose=True,
)
165
+ # ======================================== END INTERFACE
166
+
167
+
168
+
169
+
170
+
171
+
172
+
173
+
174
+
175
+
176
+
177
+
178
+
179
+
180
+
181
+
182
+
183
+
184
+
185
+
186
+
187
# Drop the foreign voices whose prompt has an insufficient duration
_SHORT_PROMPT_VOICES = (
    'bn/multi_low#02194',
    'uk_UK/m-ailabs_low#obruchov',
    'uk_UK/m-ailabs_low#shepel',
    'uk_UK/m-ailabs_low#loboda',
    'uk_UK/m-ailabs_low#miskun',
    'uk_UK/m-ailabs_low#sumska',
    'uk_UK/m-ailabs_low#pysariev',
)
foreign_voices = [
    voice for voice in foreign_voices if voice not in _SHORT_PROMPT_VOICES
]
196
+
197
+ # print(english_voices, '\n_________________________\n', foreign_voices)
198
+ # ----------------------
199
+ # print(foreign_voices.keys(), len(foreign_voices))
200
+ # raise SystemExit
201
+
202
+
203
def process_lines(state: CommandLineInterfaceState, wav_path=None):
    '''Synthesize every entry of state.texts and save the audio as one WAV file.

    Variant of the MIMIC3 internal call of the same name: the combined audio
    is written to wav_path instead of being played.

    state : mimic3 command-line state. Reads state.args, state.texts,
        state.all_audio, state.sample_rate_hz, state.sample_width_bytes and
        state.num_channels.
    wav_path : destination of the WAV file. Required whenever audio was
        produced (the None default cannot be opened).

    Returns None. If no audio was collected, only a warning is printed and
    wav_path is left untouched.
    '''
    args = state.args

    print(f'why waitings in the for loop LIN {state.texts=}\n')
    for line in state.texts:
        line_voice: typing.Optional[str] = None
        line_id = ""
        # Empty lines are deliberately NOT skipped here.
        line = line.strip()

        if args.output_naming == OutputNaming.ID:
            # Line has the format id|text instead of just text
            with io.StringIO(line) as line_io:
                reader = csv.reader(line_io, delimiter=args.csv_delimiter)
                row = next(reader)
                line_id, line = row[0], row[-1]
                if args.csv_voice:
                    line_voice = row[1]

        process_line(line, state, line_id=line_id, line_voice=line_voice)

    # Without this pause state.all_audio can still be empty when it is checked
    # (presumably the synthesized audio is delivered asynchronously - TODO confirm).
    time.sleep(4)

    if not state.all_audio:
        print('\n\nDOES NOT TTSING --> ADD SOME time.sleep(4)', wav_path)
        return

    # Always write the file when audio exists. The previous version only did so
    # when stdout was a TTY, which left callers reading a stale or missing
    # wav_path whenever the output was piped or redirected.
    with wave.open(wav_path, 'wb') as wav_file:
        wav_file.setframerate(state.sample_rate_hz)
        wav_file.setsampwidth(state.sample_width_bytes)
        wav_file.setnchannels(state.num_channels)
        wav_file.writeframes(state.all_audio)
    print('\n\n5T', wav_path)
251
+
252
+ # -----------------------------------------------------------------------------
253
+ # cat _tmp_ssml.txt | mimic3 --cuda --ssml --noise-w 0.90001 --length-scale 0.91 --noise-scale 0.04 > noise_w=0.90_en_happy_2.wav
254
+ # ======================================================================
255
+
256
+
257
+
258
+
259
+
260
+ # END DEF
261
+
262
+
263
+
264
+ # https://huggingface.co/dkounadis/artificial-styletts2/tree/main/mimic3_foreign
265
+
266
+ # STYLES Already Made - HF
267
# Output folders for the synthesized wav files and their .pkl features
english_dir = 'english_pkl/'
foreign_dir = 'foreign_pkl/'
for _out_dir in (english_dir, foreign_dir):
    Path(_out_dir).mkdir(parents=True, exist_ok=True)
272
+
273
+
274
+
275
+ # # synth 767
276
+ # for _id, _voice in enumerate(foreign_voices):
277
+ # _str = _voice.replace('/', '_').replace('#', '_').replace('_low', '')
278
+ # if 'cmu-arctic' in _str:
279
+ # _str = _str.replace('cmu-arctic', 'cmu_arctic') #+ '.wav'
280
+
281
+ # print('\n\n\n\nExecuting', _voice,'\n\n\n\n\n')
282
+
283
+ # if (
284
+ # not os.path.isfile(foreign_dir + 'mimic3__' + _str + '.wav') or
285
+ # not os.path.isfile(foreign_dir + 'styletts2__' + _str + '.wav')
286
+ # ):
287
+
288
+ # # Mimic3 GitHub Quota exceeded:
289
+ # # https://github.com/MycroftAI/mimic3-voices
290
+ # # Above repo can exceed download quota of LFS
291
+ # # Copy mimic-voices from local copies
292
+ # # clone https://huggingface.co/mukowaty/mimic3-voices/tree/main/voices
293
+ # # copy to ~/
294
+ # #
295
+ # #
296
+ # home_voice_dir = f'/home/audeering.local/dkounadis/.local/share/mycroft/mimic3/voices/{_voice.split("#")[0]}/'
297
+ # Path(home_voice_dir).mkdir(parents=True, exist_ok=True)
298
+ # speaker_free_voice_name = _voice.split("#")[0] if '#' in _voice else _voice
299
+
300
+
301
+ # if (
302
+ # (not os.path.isfile(home_voice_dir + 'generator.onnx')) or
303
+ # (os.path.getsize(home_voice_dir + 'generator.onnx') < 500) # .onnx - is just LFS header
304
+ # ):
305
+
306
+ # # Copy
307
+
308
+ # shutil.copyfile(
309
+ # f'/data/dkounadis/mimic3-voices/voices/{speaker_free_voice_name}/generator.onnx',
310
+ # home_voice_dir + 'generator.onnx')
311
+
312
+
313
+
314
+ # # pre made
315
+ # prompt_path = 'mimic3_foreign_4x/' + _str + '.wav'
316
+
317
+
318
+
319
+
320
+
321
+
322
+
323
+
324
+
325
+
326
+
327
+
328
+
329
+
330
+ # # =========================================================================== HARVRAD wav
331
+ # with open('harvard.json', 'r') as f:
332
+ # harvard_individual_sentences = json.load(f)['sentences']
333
+ # total_audio_mimic3 = []
334
+ # total_audio_stts2 = []
335
+ # ix = 0
336
+ # for list_of_10 in harvard_individual_sentences[:1]: # 77
337
+ # text = ' '.join(list_of_10['sentences'])
338
+ # # harvard.append(long_sentence.replace('.', ' '))
339
+ # # for text in list_of_10['sentences']:
340
+ # style_vec = msinference.compute_style(prompt_path)
341
+ # print(ix, text)
342
+ # ix += 1
343
+
344
+
345
+ # x = msinference.inference(text,
346
+ # style_vec,
347
+ # alpha=0.3,
348
+ # beta=0.7,
349
+ # diffusion_steps=7,
350
+ # embedding_scale=1)
351
+
352
+ # total_audio_stts2.append(x)
353
+
354
+ # # also synthesize mimic with the same sentence and voice
355
+
356
+ # # MIMIC-3 = = = = = = = = = = = = = = BEGIN
357
+
358
+ # rate = 1 # high speed sounds nice if used as speaker-reference audio for StyleTTS2
359
+ # _ssml = (
360
+ # '<speak>'
361
+ # '<prosody volume=\'64\'>'
362
+ # f'<prosody rate=\'{rate}\'>'
363
+ # f'<voice name=\'{_voice}\'>'
364
+ # '<s>'
365
+ # f'{text}'
366
+ # '</s>'
367
+ # '</voice>'
368
+ # '</prosody>'
369
+ # '</prosody>'
370
+ # '</speak>'
371
+ # )
372
+ # with open('_tmp_ssml.txt', 'w') as f:
373
+ # f.write(_ssml)
374
+
375
+
376
+ # # ps = subprocess.Popen(f'cat _tmp_ssml.txt | mimic3 --ssml > {reference_wav}', shell=True)
377
+ # # ps.wait() # using ps to call mimic3 because samples dont have time to be written in stdout buffer
378
+ # args = get_args()
379
+ # args.ssml = True
380
+ # args.text = [_ssml] #['aa', 'bb'] #txt
381
+ # args.interactive = False
382
+ # # args.output_naming = OutputNaming.TIME
383
+
384
+ # state = CommandLineInterfaceState(args=args)
385
+ # initialize_args(state)
386
+ # initialize_tts(state)
387
+ # # args.texts = [txt] #['aa', 'bb'] #txt
388
+ # # state.stdout = '.' #None #'makeme.wav'
389
+ # # state.output_dir = '.noopy'
390
+ # # state.interactive = False
391
+ # # state.output_naming = OutputNaming.TIME
392
+ # # # state.ssml = 1234546575
393
+ # # state.stdout = True
394
+ # # state.tts = True
395
+ # process_lines(state, wav_path='tmp1.wav')
396
+ # shutdown_tts(state)
397
+ # x, fs = audiofile.read('tmp1.wav')
398
+ # total_audio_mimic3.append(x)
399
+ # print(fs, text, 'mimic3')
400
+
401
+ # # MIMIC3 = = = = = = = = = = = = = = END
402
+
403
+
404
+
405
+
406
+
407
+
408
+ # total_audio_stts2 = np.concatenate(total_audio_stts2) # -- concat 77x lists
409
+ # audiofile.write(foreign_dir + 'styletts2__' + _str + '.wav', total_audio_stts2, 24000)
410
+
411
+ # total_audio_mimic3 = np.concatenate(total_audio_mimic3) # -- concat 77x lists
412
+ # audiofile.write(foreign_dir + 'mimic3__' + _str + '.wav', total_audio_mimic3, 22050)
413
+
414
+ # print('Saving:', foreign_dir + 'mimic3__' + _str + '.wav')
415
+ # else:
416
+ # print('Skip:', foreign_dir + 'styletts2__' + _str + '.wav')
417
+
418
+
419
+
420
+
421
+
422
+
423
+
424
+
425
+
426
+
427
+
428
+
429
+
430
+
431
+
432
+
433
+
434
+
435
+
436
+
437
+
438
+
439
+
440
+
441
+
442
+
443
+
444
+
445
+
446
+
447
+
448
+
449
+
450
+
451
+
452
+
453
+
454
+
455
+
456
+
457
+ # load all harvard and for every voice -> load-its-style -> synth-mimic3 -> synth-stylett2 -> run-both-pkl
458
+ # FOREIGN
459
# For every voice: synthesize the Harvard sentences with StyleTTS2 and with
# Mimic-3, save both as wav, then run the audinterface feature extractor on
# each wav and pickle the resulting dataframe.
# NOTE(review): the nesting below was reconstructed from a listing without
# indentation - confirm against the original file.
for folder, list_voices in [
    ['foreign', foreign_voices],
    ['english', english_voices],
]:
    print(folder, list_voices[:4], '\n\nEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE')
    for _voice in list_voices[:4]:  # only the first 4 voices of each group
        _str = _voice.replace('/', '_').replace('#', '_').replace('_low', '')
        _str = _str.replace('cmu-arctic', 'cmu_arctic')
        _dir = folder + '_pkl/'

        print('\n\n\n\nExecuting', _voice, '\n\n\n\n\n')

        mimic3_wav = _dir + 'mimic3__' + _str + '.wav'
        styletts2_wav = _dir + 'styletts2__' + _str + '.wav'

        if not (os.path.isfile(mimic3_wav) and os.path.isfile(styletts2_wav)):

            # Mimic3 GitHub quota exceeded:
            #   https://github.com/MycroftAI/mimic3-voices
            # The repo above can exceed the LFS download quota, so the voice
            # model is copied from a local clone of
            #   https://huggingface.co/mukowaty/mimic3-voices/tree/main/voices
            # split('#')[0] also returns the whole id when there is no '#'
            speaker_free_voice_name = _voice.split('#')[0]
            home_voice_dir = f'/home/audeering.local/dkounadis/.local/share/mycroft/mimic3/voices/{speaker_free_voice_name}/'
            Path(home_voice_dir).mkdir(parents=True, exist_ok=True)

            local_onnx = home_voice_dir + 'generator.onnx'
            if (
                (not os.path.isfile(local_onnx)) or
                (os.path.getsize(local_onnx) < 500)  # .onnx - is just LFS header
            ):
                shutil.copyfile(
                    f'/data/dkounadis/mimic3-voices/voices/{speaker_free_voice_name}/generator.onnx',
                    local_onnx)

            # pre made prompt; its style does not depend on the sentence,
            # so it is computed once per voice
            prompt_path = f'mimic3_{folder}_4x/' + _str + '.wav'
            style_vec = msinference.compute_style(prompt_path)

            # ACTUAL TTS
            with open('harvard.json', 'r') as f:
                harvard_individual_sentences = json.load(f)['sentences']
            total_audio_mimic3 = []
            total_audio_stts2 = []
            for ix, list_of_10 in enumerate(harvard_individual_sentences[:1]):  # 77
                text = ' '.join(list_of_10['sentences'])
                print(ix, text)

                x = msinference.inference(text,
                                          style_vec,
                                          alpha=0.3,
                                          beta=0.7,
                                          diffusion_steps=7,
                                          embedding_scale=1)
                total_audio_stts2.append(x)

                # also synthesize mimic with the same sentence and voice
                # MIMIC-3 = = = = = = = = = = = = = = BEGIN

                rate = 1  # high speed sounds nice if used as speaker-reference audio for StyleTTS2
                _ssml = (
                    "<speak>"
                    "<prosody volume='64'>"
                    f"<prosody rate='{rate}'>"
                    f"<voice name='{_voice}'>"
                    "<s>"
                    f"{text}"
                    "</s>"
                    "</voice>"
                    "</prosody>"
                    "</prosody>"
                    "</speak>"
                )
                with open('_tmp_ssml.txt', 'w') as f:
                    f.write(_ssml)

                # Kept in its own name: rebinding the module-level `args`
                # would break process_function, which reads args.dev when the
                # feature extraction below runs.
                mimic3_args = get_args()
                mimic3_args.ssml = True
                mimic3_args.text = [_ssml]
                mimic3_args.interactive = False

                state = CommandLineInterfaceState(args=mimic3_args)
                initialize_args(state)
                initialize_tts(state)
                process_lines(state, wav_path='tmp1.wav')
                shutdown_tts(state)
                x, fs = audiofile.read('tmp1.wav')
                total_audio_mimic3.append(x)
                print(fs, text, 'mimic3')

                # MIMIC3 = = = = = = = = = = = = = = END

            total_audio_stts2 = np.concatenate(total_audio_stts2)  # -- concat 77x lists
            audiofile.write(styletts2_wav, total_audio_stts2, 24000)

            total_audio_mimic3 = np.concatenate(total_audio_mimic3)  # -- concat 77x lists
            audiofile.write(mimic3_wav, total_audio_mimic3, 22050)

            print('Saving:', mimic3_wav)
        else:
            print('Skip:', styletts2_wav)

        # AUD I N T E R F A C E
        # one .pkl of features per engine, computed only if not already on disk
        for engine in ['mimic3', 'styletts2']:
            harvard_of_voice = f'{_dir}{engine}__{_str}'
            if not os.path.exists(harvard_of_voice + '.pkl'):
                df_pred = interface.process_file(harvard_of_voice + '.wav')
                df_pred.to_pickle(harvard_of_voice + '.pkl')
            else:
                print(harvard_of_voice + '.pkl', 'FOUND')