mimic3 vs styletts2 - both wav & pkl
Browse files- mimic3_make_harvard_sentences.py +618 -0
mimic3_make_harvard_sentences.py
ADDED
@@ -0,0 +1,618 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# 1. Synthesize Harvard Sentences via Mimic-3 - 1 voice
|
2 |
+
# 1. Synthesize via StyleTTS2 --> use same or sweetdreams
|
3 |
+
# 2. Run audinterface on this 767
|
4 |
+
# 3. .mimic3_pkl .styletts2_pkl -> different durations
|
5 |
+
|
6 |
+
# It may crash due to non-truly-blocking shutil.copyfile() saying onnx protobuf incomplete file
|
7 |
+
# You have to rerun the script - it will copy all voices from hf:mimic3-voices to ~/.local/mimic3
|
8 |
+
import shutil
|
9 |
+
import csv
|
10 |
+
import io
|
11 |
+
import os
|
12 |
+
import typing
|
13 |
+
import wave
|
14 |
+
import sys
|
15 |
+
from mimic3_tts.__main__ import (CommandLineInterfaceState,
|
16 |
+
get_args,
|
17 |
+
initialize_args,
|
18 |
+
initialize_tts,
|
19 |
+
# print_voices,
|
20 |
+
# process_lines,
|
21 |
+
shutdown_tts,
|
22 |
+
OutputNaming,
|
23 |
+
process_line)
|
24 |
+
import msinference
|
25 |
+
import time
|
26 |
+
import json
|
27 |
+
import pandas as pd
|
28 |
+
import os
|
29 |
+
import numpy as np
|
30 |
+
import audonnx
|
31 |
+
import audb
|
32 |
+
from pathlib import Path
|
33 |
+
import transformers
|
34 |
+
import torch
|
35 |
+
import audmodel
|
36 |
+
import audinterface
|
37 |
+
import matplotlib.pyplot as plt
|
38 |
+
import audiofile
|
39 |
+
|
40 |
+
|
41 |
+
# ================================================ LIST OF VOICES
|
42 |
+
ROOT_DIR = '/data/dkounadis/mimic3-voices/'
foreign_voices = []
english_voices = []


def _voice_keys(lang, voice):
    '''Return the Mimic-3 voice keys available for one voice folder.

    Args:
        lang: name of the language folder below ``ROOT_DIR/voices``.
        voice: name of the voice folder below that language folder.

    Returns:
        ``['<lang>/<voice>#<speaker>', ...]`` with one entry per non-empty
        line of the folder's ``speakers.txt``, or ``['<lang>/<voice>']`` when
        the folder has no ``speakers.txt``.
    '''
    speakers_txt = os.path.join(ROOT_DIR, 'voices', lang, voice, 'speakers.txt')
    try:
        with open(speakers_txt, 'r') as f:
            speakers = [spk.rstrip() for spk in f]
    except FileNotFoundError:
        return [lang + '/' + voice]
    # Blank lines would otherwise yield a bogus key ending in '#'.
    return [lang + '/' + voice + '#' + spk for spk in speakers if spk]


# os.listdir() returns entries in arbitrary order; sort so that the voice
# lists (and any slice taken from them later) are reproducible across runs.
for lang in sorted(os.listdir(ROOT_DIR + 'voices')):
    lang_dir = os.path.join(ROOT_DIR, 'voices', lang)
    if not os.path.isdir(lang_dir):
        continue  # stray file next to the language folders
    # Folders whose name contains 'en_' are English, all others are foreign.
    bucket = english_voices if 'en_' in lang else foreign_voices
    for voice in sorted(os.listdir(lang_dir)):
        if not os.path.isdir(os.path.join(lang_dir, voice)):
            continue  # stray file next to the voice folders
        bucket.extend(_voice_keys(lang, voice))
|
67 |
+
# ================================================== INTERFACE MODELS
|
68 |
+
# Column names of the feature interface: three dimensional attributes
# followed by eight emotion categories.
LABELS = (
    ['arousal', 'dominance', 'valence']
    # 'speech_synthesizer', 'synthetic_singing',
    + ['Angry', 'Sad', 'Happy', 'Surprise']
    + ['Fear', 'Disgust', 'Contempt', 'Neutral']
)
|
80 |
+
|
81 |
+
|
82 |
+
# A default Wav2Vec2Config is used as a plain attribute holder for the devices.
args = transformers.Wav2Vec2Config()  # finetuning_task='spef2feat_reg')
for _device_attr in ('dev', 'dev2'):
    setattr(args, _device_attr, torch.device('cuda:0'))
|
85 |
+
def _softmax(x):
|
86 |
+
'''x : (batch, num_class)'''
|
87 |
+
x -= x.max(1, keepdims=True) # if all -400 then sum(exp(x)) = 0
|
88 |
+
x = np.maximum(-100, x)
|
89 |
+
x = np.exp(x)
|
90 |
+
x /= x.sum(1, keepdims=True)
|
91 |
+
return x
|
92 |
+
|
93 |
+
|
94 |
+
from transformers import AutoModelForAudioClassification
|
95 |
+
import types
|
96 |
+
|
97 |
+
|
98 |
+
def _infer(self, x):
|
99 |
+
'''x: (batch, audio-samples-16KHz)'''
|
100 |
+
x = (x + self.config.mean) / self.config.std # plus
|
101 |
+
x = self.ssl_model(x, attention_mask=None).last_hidden_state
|
102 |
+
# pool
|
103 |
+
h = self.pool_model.sap_linear(x).tanh()
|
104 |
+
w = torch.matmul(h, self.pool_model.attention)
|
105 |
+
w = w.softmax(1)
|
106 |
+
mu = (x * w).sum(1)
|
107 |
+
x = torch.cat(
|
108 |
+
[
|
109 |
+
mu,
|
110 |
+
((x * x * w).sum(1) - mu * mu).clamp(min=1e-7).sqrt()
|
111 |
+
], 1)
|
112 |
+
return self.ser_model(x)
|
113 |
+
|
114 |
+
# Categorical emotion model; its class definitions come from the model repo,
# hence trust_remote_code.
teacher_cat = AutoModelForAudioClassification.from_pretrained(
    '3loi/SER-Odyssey-Baseline-WavLM-Categorical-Attributes',
    trust_remote_code=True,  # fun definitions see 3loi/SER-.. repo
)
teacher_cat = teacher_cat.to(args.dev2)
teacher_cat = teacher_cat.eval()
# Route calls of the model through _infer instead of its own forward.
teacher_cat.forward = types.MethodType(_infer, teacher_cat)

# Audioset & ADV
# audioset_model = audonnx.load(audmodel.load('17c240ec-1.0.0'), device='cuda:0')
_adv_model_root = audmodel.load('90398682-2.0.0')
adv_model = audonnx.load(_adv_model_root, device='cuda:0')
|
126 |
+
|
127 |
+
def process_function(x, sampling_rate, idx):
    '''Run the ADV model and the categorical emotion model on one signal.

    Used as ``process_func`` of the ``audinterface.Feature`` interface.

    Args:
        x: numpy signal; it is passed unchanged to both models.
            (assumes the interface delivers 16 kHz audio shaped
            (channels, samples) - TODO confirm)
        sampling_rate: not used; the ADV model is always called with 16000.
        idx: not used.

    Returns:
        numpy array holding the ADV logits followed by the softmax of the
        categorical logits, concatenated along axis 1. The category order
        noted by the original author is
        {0: 'Angry', 1: 'Sad', 2: 'Happy', 3: 'Surprise',
         4: 'Fear', 5: 'Disgust', 6: 'Contempt', 7: 'Neutral'}.
    '''
    # Take the device from the model itself: the module-level name `args` is
    # rebound further down the script, so `args.dev` is not reliable here.
    device = next(teacher_cat.parameters()).device
    # Pure inference - no autograd graph needed.
    with torch.no_grad():
        logits_cat = teacher_cat(torch.from_numpy(x).to(device)).cpu().numpy()
    # USE ALL CATEGORIES
    # --
    # logits_audioset = audioset_model(x, 16000)['logits_sounds']
    # logits_audioset = logits_audioset[:, [7, 35]]  # speech synthesizer synthetic singing
    # --
    logits_adv = adv_model(x, 16000)['logits']

    cat = np.concatenate([logits_adv,
                          # _sigmoid(logits_audioset),
                          _softmax(logits_cat)],
                         1)
    print(cat)
    return cat
|
153 |
+
|
154 |
+
# Feature interface: process_function is applied to 4 s windows taken every
# 1 s, on audio resampled to 16 kHz; the columns are named after LABELS.
_interface_options = {
    'feature_names': LABELS,
    'process_func': process_function,
    # 'process_func_args': {'outputs': 'logits_scene'},
    'process_func_applies_sliding_window': False,
    'win_dur': 4.0,
    'hop_dur': 1.0,
    'sampling_rate': 16000,
    'resample': True,
    'verbose': True,
}
interface = audinterface.Feature(**_interface_options)
|
165 |
+
# ======================================== END INTERFACE
|
166 |
+
|
167 |
+
|
168 |
+
|
169 |
+
|
170 |
+
|
171 |
+
|
172 |
+
|
173 |
+
|
174 |
+
|
175 |
+
|
176 |
+
|
177 |
+
|
178 |
+
|
179 |
+
|
180 |
+
|
181 |
+
|
182 |
+
|
183 |
+
|
184 |
+
|
185 |
+
|
186 |
+
|
187 |
+
# Filter insufficient durations - prompt
|
188 |
+
# Voices dropped from the foreign list (insufficient prompt duration).
_EXCLUDED_FOREIGN_VOICES = (
    'bn/multi_low#02194',
    'uk_UK/m-ailabs_low#obruchov',
    'uk_UK/m-ailabs_low#shepel',
    'uk_UK/m-ailabs_low#loboda',
    'uk_UK/m-ailabs_low#miskun',
    'uk_UK/m-ailabs_low#sumska',
    'uk_UK/m-ailabs_low#pysariev',
)
foreign_voices = [
    voice_key
    for voice_key in foreign_voices
    if voice_key not in _EXCLUDED_FOREIGN_VOICES
]
|
196 |
+
|
197 |
+
# print(english_voices, '\n_________________________\n', foreign_voices)
|
198 |
+
# ----------------------
|
199 |
+
# print(foreign_voices.keys(), len(foreign_voices))
|
200 |
+
# raise SystemExit
|
201 |
+
|
202 |
+
|
203 |
+
def process_lines(state: CommandLineInterfaceState, wav_path=None):
    '''Synthesize all ``state.texts`` and write the joined audio to a wav.

    MIMIC3 INTERNAL CALL that yields the sigh sound.

    Args:
        state: Mimic-3 command line state; its ``texts`` are synthesized via
            ``process_line`` and the result is read from ``state.all_audio``.
        wav_path: file the combined audio is written to. With ``None``
            nothing is written.

    The wav is only written when ``sys.stdout`` is a TTY and
    ``state.args.stdout`` is not set.
    '''
    args = state.args

    print(f'why waitings in the for loop LIN {state.texts=}\n')
    for line in state.texts:
        line_voice: typing.Optional[str] = None
        line_id = ""
        line = line.strip()

        if args.output_naming == OutputNaming.ID:
            # Line has the format id|text instead of just text
            with io.StringIO(line) as line_io:
                reader = csv.reader(line_io, delimiter=args.csv_delimiter)
                row = next(reader)
                line_id, line = row[0], row[-1]
                if args.csv_voice:
                    line_voice = row[1]

        process_line(line, state, line_id=line_id, line_voice=line_voice)

    # NOTE(review): presumably gives the synthesis time to fill
    # state.all_audio (see the message printed below); assumed to run once
    # after the loop - confirm against the original indentation.
    time.sleep(4)

    # NOTE(review): this branch is assumed to pair with the check on
    # state.all_audio - confirm against the original indentation.
    if not state.all_audio:
        print('\n\nDOES NOT TTSING --> ADD SOME time.sleep(4)', wav_path)
        return

    if not sys.stdout.isatty() or state.args.stdout:
        return
    if wav_path is None:
        # No destination given; open(None, 'wb') would raise TypeError.
        return

    # Wrap the raw frames in a wav container in memory, then dump it to disk.
    with io.BytesIO() as wav_io:
        wav_file_play: wave.Wave_write = wave.open(wav_io, "wb")
        with wav_file_play:
            wav_file_play.setframerate(state.sample_rate_hz)
            wav_file_play.setsampwidth(state.sample_width_bytes)
            wav_file_play.setnchannels(state.num_channels)
            wav_file_play.writeframes(state.all_audio)

        with open(wav_path, 'wb') as wav_file:
            wav_file.write(wav_io.getvalue())
    print('\n\n5T', wav_path)
|
251 |
+
|
252 |
+
# -----------------------------------------------------------------------------
|
253 |
+
# cat _tmp_ssml.txt | mimic3 --cuda --ssml --noise-w 0.90001 --length-scale 0.91 --noise-scale 0.04 > noise_w=0.90_en_happy_2.wav
|
254 |
+
# ======================================================================
|
255 |
+
|
256 |
+
|
257 |
+
|
258 |
+
|
259 |
+
|
260 |
+
# END DEF
|
261 |
+
|
262 |
+
|
263 |
+
|
264 |
+
# https://huggingface.co/dkounadis/artificial-styletts2/tree/main/mimic3_foreign
|
265 |
+
|
266 |
+
# STYLES Already Made - HF
|
267 |
+
# Output folders, created up front if missing.
english_dir = 'english_pkl/'
foreign_dir = 'foreign_pkl/'

for _out_dir in (english_dir, foreign_dir):
    Path(_out_dir).mkdir(parents=True, exist_ok=True)
|
272 |
+
|
273 |
+
|
274 |
+
|
275 |
+
# # synth 767
|
276 |
+
# for _id, _voice in enumerate(foreign_voices):
|
277 |
+
# _str = _voice.replace('/', '_').replace('#', '_').replace('_low', '')
|
278 |
+
# if 'cmu-arctic' in _str:
|
279 |
+
# _str = _str.replace('cmu-arctic', 'cmu_arctic') #+ '.wav'
|
280 |
+
|
281 |
+
# print('\n\n\n\nExecuting', _voice,'\n\n\n\n\n')
|
282 |
+
|
283 |
+
# if (
|
284 |
+
# not os.path.isfile(foreign_dir + 'mimic3__' + _str + '.wav') or
|
285 |
+
# not os.path.isfile(foreign_dir + 'styletts2__' + _str + '.wav')
|
286 |
+
# ):
|
287 |
+
|
288 |
+
# # Mimic3 GitHub Quota exceded:
|
289 |
+
# # https://github.com/MycroftAI/mimic3-voices
|
290 |
+
# # Above repo can exceed download quota of LFS
|
291 |
+
# # Copy mimic-voices from local copies
|
292 |
+
# # clone https://huggingface.co/mukowaty/mimic3-voices/tree/main/voices
|
293 |
+
# # copy to ~/
|
294 |
+
# #
|
295 |
+
# #
|
296 |
+
# home_voice_dir = f'/home/audeering.local/dkounadis/.local/share/mycroft/mimic3/voices/{_voice.split("#")[0]}/'
|
297 |
+
# Path(home_voice_dir).mkdir(parents=True, exist_ok=True)
|
298 |
+
# speaker_free_voice_name = _voice.split("#")[0] if '#' in _voice else _voice
|
299 |
+
|
300 |
+
|
301 |
+
# if (
|
302 |
+
# (not os.path.isfile(home_voice_dir + 'generator.onnx')) or
|
303 |
+
# (os.path.getsize(home_voice_dir + 'generator.onnx') < 500) # .onnx - is just LFS header
|
304 |
+
# ):
|
305 |
+
|
306 |
+
# # Copy
|
307 |
+
|
308 |
+
# shutil.copyfile(
|
309 |
+
# f'/data/dkounadis/mimic3-voices/voices/{speaker_free_voice_name}/generator.onnx',
|
310 |
+
# home_voice_dir + 'generator.onnx')
|
311 |
+
|
312 |
+
|
313 |
+
|
314 |
+
# # pre made
|
315 |
+
# prompt_path = 'mimic3_foreign_4x/' + _str + '.wav'
|
316 |
+
|
317 |
+
|
318 |
+
|
319 |
+
|
320 |
+
|
321 |
+
|
322 |
+
|
323 |
+
|
324 |
+
|
325 |
+
|
326 |
+
|
327 |
+
|
328 |
+
|
329 |
+
|
330 |
+
# # =========================================================================== HARVRAD wav
|
331 |
+
# with open('harvard.json', 'r') as f:
|
332 |
+
# harvard_individual_sentences = json.load(f)['sentences']
|
333 |
+
# total_audio_mimic3 = []
|
334 |
+
# total_audio_stts2 = []
|
335 |
+
# ix = 0
|
336 |
+
# for list_of_10 in harvard_individual_sentences[:1]: # 77
|
337 |
+
# text = ' '.join(list_of_10['sentences'])
|
338 |
+
# # harvard.append(long_sentence.replace('.', ' '))
|
339 |
+
# # for text in list_of_10['sentences']:
|
340 |
+
# style_vec = msinference.compute_style(prompt_path)
|
341 |
+
# print(ix, text)
|
342 |
+
# ix += 1
|
343 |
+
|
344 |
+
|
345 |
+
# x = msinference.inference(text,
|
346 |
+
# style_vec,
|
347 |
+
# alpha=0.3,
|
348 |
+
# beta=0.7,
|
349 |
+
# diffusion_steps=7,
|
350 |
+
# embedding_scale=1)
|
351 |
+
|
352 |
+
# total_audio_stts2.append(x)
|
353 |
+
|
354 |
+
# # also synthesize mimic with the same sentence and voice
|
355 |
+
|
356 |
+
# # MIMIC-3 = = = = = = = = = = = = = = BEGIN
|
357 |
+
|
358 |
+
# rate = 1 # high speed sounds nice if used as speaker-reference audio for StyleTTS2
|
359 |
+
# _ssml = (
|
360 |
+
# '<speak>'
|
361 |
+
# '<prosody volume=\'64\'>'
|
362 |
+
# f'<prosody rate=\'{rate}\'>'
|
363 |
+
# f'<voice name=\'{_voice}\'>'
|
364 |
+
# '<s>'
|
365 |
+
# f'{text}'
|
366 |
+
# '</s>'
|
367 |
+
# '</voice>'
|
368 |
+
# '</prosody>'
|
369 |
+
# '</prosody>'
|
370 |
+
# '</speak>'
|
371 |
+
# )
|
372 |
+
# with open('_tmp_ssml.txt', 'w') as f:
|
373 |
+
# f.write(_ssml)
|
374 |
+
|
375 |
+
|
376 |
+
# # ps = subprocess.Popen(f'cat _tmp_ssml.txt | mimic3 --ssml > {reference_wav}', shell=True)
|
377 |
+
# # ps.wait() # using ps to call mimic3 because samples dont have time to be written in stdout buffer
|
378 |
+
# args = get_args()
|
379 |
+
# args.ssml = True
|
380 |
+
# args.text = [_ssml] #['aa', 'bb'] #txt
|
381 |
+
# args.interactive = False
|
382 |
+
# # args.output_naming = OutputNaming.TIME
|
383 |
+
|
384 |
+
# state = CommandLineInterfaceState(args=args)
|
385 |
+
# initialize_args(state)
|
386 |
+
# initialize_tts(state)
|
387 |
+
# # args.texts = [txt] #['aa', 'bb'] #txt
|
388 |
+
# # state.stdout = '.' #None #'makeme.wav'
|
389 |
+
# # state.output_dir = '.noopy'
|
390 |
+
# # state.interactive = False
|
391 |
+
# # state.output_naming = OutputNaming.TIME
|
392 |
+
# # # state.ssml = 1234546575
|
393 |
+
# # state.stdout = True
|
394 |
+
# # state.tts = True
|
395 |
+
# process_lines(state, wav_path='tmp1.wav')
|
396 |
+
# shutdown_tts(state)
|
397 |
+
# x, fs = audiofile.read('tmp1.wav')
|
398 |
+
# total_audio_mimic3.append(x)
|
399 |
+
# print(fs, text, 'mimic3')
|
400 |
+
|
401 |
+
# # MIMIC3 = = = = = = = = = = = = = = END
|
402 |
+
|
403 |
+
|
404 |
+
|
405 |
+
|
406 |
+
|
407 |
+
|
408 |
+
# total_audio_stts2 = np.concatenate(total_audio_stts2) # -- concat 77x lists
|
409 |
+
# audiofile.write(foreign_dir + 'styletts2__' + _str + '.wav', total_audio_stts2, 24000)
|
410 |
+
|
411 |
+
# total_audio_mimic3 = np.concatenate(total_audio_mimic3) # -- concat 77x lists
|
412 |
+
# audiofile.write(foreign_dir + 'mimic3__' + _str + '.wav', total_audio_mimic3, 22050)
|
413 |
+
|
414 |
+
# print('Saving:', foreign_dir + 'mimic3__' + _str + '.wav')
|
415 |
+
# else:
|
416 |
+
# print('Skip:', foreign_dir + 'styletts2__' + _str + '.wav')
|
417 |
+
|
418 |
+
|
419 |
+
|
420 |
+
|
421 |
+
|
422 |
+
|
423 |
+
|
424 |
+
|
425 |
+
|
426 |
+
|
427 |
+
|
428 |
+
|
429 |
+
|
430 |
+
|
431 |
+
|
432 |
+
|
433 |
+
|
434 |
+
|
435 |
+
|
436 |
+
|
437 |
+
|
438 |
+
|
439 |
+
|
440 |
+
|
441 |
+
|
442 |
+
|
443 |
+
|
444 |
+
|
445 |
+
|
446 |
+
|
447 |
+
|
448 |
+
|
449 |
+
|
450 |
+
|
451 |
+
|
452 |
+
|
453 |
+
|
454 |
+
|
455 |
+
|
456 |
+
|
457 |
+
# load all harvard and for every voice -> load-its-style -> synth-mimic3 -> synth-stylett2 -> run-both-pkl
|
458 |
+
# FOREIGN
|
459 |
+
# For the first 4 voices of each group: synthesize the Harvard text with
# StyleTTS2 and Mimic-3 (unless both wavs already exist), then run the
# feature interface on both wavs and pickle the predictions.
for folder, list_voices in [
    ['foreign', foreign_voices],
    ['english', english_voices],
]:
    print(folder, list_voices[:4], '\n\nEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE')
    for _voice in list_voices[:4]:
        _str = _voice.replace('/', '_').replace('#', '_').replace('_low', '')
        _dir = folder + '_pkl/'
        if 'cmu-arctic' in _str:
            _str = _str.replace('cmu-arctic', 'cmu_arctic')

        print('\n\n\n\nExecuting', _voice, '\n\n\n\n\n')

        if (
            not os.path.isfile(_dir + 'mimic3__' + _str + '.wav') or
            not os.path.isfile(_dir + 'styletts2__' + _str + '.wav')
        ):

            # Mimic3 GitHub quota exceeded:
            # https://github.com/MycroftAI/mimic3-voices
            # Above repo can exceed download quota of LFS
            # Copy mimic-voices from local copies
            # clone https://huggingface.co/mukowaty/mimic3-voices/tree/main/voices
            # copy to ~/
            speaker_free_voice_name = _voice.split("#")[0]
            home_voice_dir = f'/home/audeering.local/dkounadis/.local/share/mycroft/mimic3/voices/{speaker_free_voice_name}/'
            Path(home_voice_dir).mkdir(parents=True, exist_ok=True)

            if (
                (not os.path.isfile(home_voice_dir + 'generator.onnx')) or
                (os.path.getsize(home_voice_dir + 'generator.onnx') < 500)  # .onnx - is just LFS header
            ):
                # Copy
                shutil.copyfile(
                    f'/data/dkounadis/mimic3-voices/voices/{speaker_free_voice_name}/generator.onnx',
                    home_voice_dir + 'generator.onnx')

            # pre made
            prompt_path = f'mimic3_{folder}_4x/' + _str + '.wav'

            # ACTUAL TTS
            with open('harvard.json', 'r') as f:
                harvard_individual_sentences = json.load(f)['sentences']
            total_audio_mimic3 = []
            total_audio_stts2 = []
            # The style depends only on the voice prompt, so compute it once
            # per voice instead of once per sentence list.
            style_vec = msinference.compute_style(prompt_path)
            for ix, list_of_10 in enumerate(harvard_individual_sentences[:1]):  # 77
                text = ' '.join(list_of_10['sentences'])
                print(ix, text)

                x = msinference.inference(text,
                                          style_vec,
                                          alpha=0.3,
                                          beta=0.7,
                                          diffusion_steps=7,
                                          embedding_scale=1)

                total_audio_stts2.append(x)

                # also synthesize mimic with the same sentence and voice

                # MIMIC-3 = = = = = = = = = = = = = = BEGIN

                rate = 1  # high speed sounds nice if used as speaker-reference audio for StyleTTS2
                _ssml = (
                    '<speak>'
                    '<prosody volume=\'64\'>'
                    f'<prosody rate=\'{rate}\'>'
                    f'<voice name=\'{_voice}\'>'
                    '<s>'
                    f'{text}'
                    '</s>'
                    '</voice>'
                    '</prosody>'
                    '</prosody>'
                    '</speak>'
                )
                with open('_tmp_ssml.txt', 'w') as f:
                    f.write(_ssml)

                # Do NOT name this `args`: that would overwrite the
                # module-level `args` object that carries the torch devices.
                mimic3_args = get_args()
                mimic3_args.ssml = True
                mimic3_args.text = [_ssml]
                mimic3_args.interactive = False

                state = CommandLineInterfaceState(args=mimic3_args)
                initialize_args(state)
                initialize_tts(state)
                process_lines(state, wav_path='tmp1.wav')
                shutdown_tts(state)
                x, fs = audiofile.read('tmp1.wav')
                total_audio_mimic3.append(x)
                print(fs, text, 'mimic3')

                # MIMIC3 = = = = = = = = = = = = = = END

            total_audio_stts2 = np.concatenate(total_audio_stts2)  # -- concat 77x lists
            audiofile.write(_dir + 'styletts2__' + _str + '.wav', total_audio_stts2, 24000)

            total_audio_mimic3 = np.concatenate(total_audio_mimic3)  # -- concat 77x lists
            audiofile.write(_dir + 'mimic3__' + _str + '.wav', total_audio_mimic3, 22050)

            print('Saving:', _dir + 'mimic3__' + _str + '.wav')
        else:
            print('Skip:', _dir + 'styletts2__' + _str + '.wav')

        # AUD I N T E R F A C E
        # One pickle of predictions per engine; existing pickles are kept.
        for engine in ['mimic3', 'styletts2']:
            harvard_of_voice = f'{_dir}{engine}__{_str}'
            if not os.path.exists(harvard_of_voice + '.pkl'):
                df_pred = interface.process_file(harvard_of_voice + '.wav')
                df_pred.to_pickle(harvard_of_voice + '.pkl')
            else:
                print(harvard_of_voice + '.pkl', 'FOUND')
|