Dionyssos committed on
Commit
9184afc
1 Parent(s): 27ac763

upload_styles.py

Browse files
Files changed (1) hide show
  1. upload_styles.py +241 -0
upload_styles.py ADDED
@@ -0,0 +1,241 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # https://github.com/audeering/shift/tree/main - MAKE Mimic-3 voice / harvard 1x 4x
2
+ import shutil
3
+ import csv
4
+ import io
5
+ import os
6
+ import typing
7
+ import wave
8
+ import sys
9
+ from mimic3_tts.__main__ import (CommandLineInterfaceState,
10
+ get_args,
11
+ initialize_args,
12
+ initialize_tts,
13
+ # print_voices,
14
+ # process_lines,
15
+ shutdown_tts,
16
+ OutputNaming,
17
+ process_line)
18
+ import time
19
+ import json
20
+ import os
21
+ import numpy as np
22
+
23
+ from pathlib import Path
24
+ import audiofile
25
+
26
+
27
+ # ================================================ LIST OF VOICES
28
# Root of the local checkout that holds the Mimic-3 voices under `voices/<lang>/<voice>/`.
ROOT_DIR = '/data/dkounadis/mimic3-voices/'


def _voice_ids(lang, voice):
    '''Return the voice identifiers available in one voice directory.

    A voice that ships a `speakers.txt` yields one `<lang>/<voice>#<speaker>`
    entry per line of that file; a voice without the file yields the single
    entry `<lang>/<voice>`.
    '''
    speakers_file = ROOT_DIR + 'voices/' + lang + '/' + voice + '/speakers.txt'
    try:
        with open(speakers_file, 'r') as f:
            return [lang + '/' + voice + '#' + spk.rstrip() for spk in f]
    except FileNotFoundError:
        return [lang + '/' + voice]


foreign_voices = []
english_voices = []
for lang in os.listdir(ROOT_DIR + 'voices'):
    # Language directories whose name contains 'en_' are treated as English.
    bucket = english_voices if 'en_' in lang else foreign_voices
    for voice in os.listdir(ROOT_DIR + 'voices/' + lang):
        bucket.extend(_voice_ids(lang, voice))

# Show what was discovered: foreign voices first, then the English ones.
for voice_id in foreign_voices:
    print(voice_id)
print('\n_______________________________\n')
for voice_id in english_voices:
    print(voice_id)
57
+ # ====================================================== LIST Mimic-3 ALL VOICES
58
+ # list_voices = [
59
+ # 'en_US/m-ailabs_low#mary_ann',
60
+ # 'en_UK/apope_low',
61
+ # 'de_DE/thorsten-emotion_low#neutral', # is the 4x really interesting we can just write it in Section
62
+ # # 'ko_KO/kss_low',
63
+ # 'fr_FR/m-ailabs_low#gilles_g_le_blanc',
64
+
65
+ # #'human',
66
+ # ] # special - for human we load specific style file - no Mimic3 is run
67
+
68
+ # ================================== ====== END INTERFACE
69
+
70
+
71
+
72
+
73
+
74
def process_lines(state: CommandLineInterfaceState, wav_path=None):
    '''Run Mimic-3 on every entry of `state.texts` and save the audio to `wav_path`.

    Variant of the Mimic-3 command line `process_lines`: the combined audio
    is written to a WAV file instead of being played back.

    Args:
        state: Mimic-3 CLI state. Reads `args`, `texts`, `all_audio`,
            `sample_rate_hz`, `sample_width_bytes` and `num_channels`.
        wav_path: Destination of the WAV file. Must be given whenever audio
            is actually written (see the tty condition below).

    Raises:
        ValueError: If audio is ready to be written but `wav_path` is None.
    '''
    args = state.args

    print(f'why waitings in the for loop LIN {state.texts=}\n')
    for line in state.texts:
        line_voice: typing.Optional[str] = None
        line_id = ""
        # Empty lines are intentionally NOT skipped here.
        line = line.strip()

        if args.output_naming == OutputNaming.ID:
            # Line has the format id|text instead of just text
            with io.StringIO(line) as line_io:
                reader = csv.reader(line_io, delimiter=args.csv_delimiter)
                row = next(reader)
                line_id, line = row[0], row[-1]
                if args.csv_voice:
                    line_voice = row[1]

        process_line(line, state, line_id=line_id, line_voice=line_voice)

    # NOTE(review): presumably gives Mimic-3 time to fill `state.all_audio`
    # before it is read below - confirm whether a proper join is available.
    time.sleep(4)

    if not state.all_audio:
        print('\n\nDOES NOT TTSING --> ADD SOME time.sleep(4)', wav_path)
        return

    # NOTE(review): the file is only written when stdout is a terminal and
    # --stdout is off; with redirected output nothing is saved - confirm intent.
    if sys.stdout.isatty() and (not state.args.stdout):
        if wav_path is None:
            raise ValueError('wav_path is required to save the synthesized audio')

        # Wrap the raw frames in a WAV container held in memory.
        with io.BytesIO() as wav_io:
            with wave.open(wav_io, "wb") as wav_file_play:
                wav_file_play.setframerate(state.sample_rate_hz)
                wav_file_play.setsampwidth(state.sample_width_bytes)
                wav_file_play.setnchannels(state.num_channels)
                wav_file_play.writeframes(state.all_audio)
            wav_bytes = wav_io.getvalue()

        with open(wav_path, 'wb') as wav_file:
            wav_file.write(wav_bytes)
        print('\n\n5T', wav_path)
124
+
125
+ # -----------------------------------------------------------------------------
126
+ # cat _tmp_ssml.txt | mimic3 --cuda --ssml --noise-w 0.90001 --length-scale 0.91 --noise-scale 0.04 > noise_w=0.90_en_happy_2.wav
127
+ # ======================================================================
128
+
129
+
130
+
131
+
132
# Harvard sentence lists. Loaded once: the content does not depend on the
# language group or on the speaking rate.
with open('harvard.json', 'r') as f:
    harvard_individual_sentences = json.load(f)['sentences']

# For each language group and speaking rate, synthesize the Harvard sentences
# with Mimic-3 (cycling through the voices of the group) and save everything
# as one concatenated WAV file.
for lang, list_voices in [
        ['english', english_voices],
        ['foreign', foreign_voices],
]:
    if not list_voices:
        # `ix % len(list_voices)` below would otherwise divide by zero.
        print(f'No {lang} voices found - skipping')
        continue

    for rate in [1, 4]:
        total_audio_mimic3 = []

        ix = 0
        for list_of_10 in harvard_individual_sentences[:4]:  # 77
            for text in list_of_10['sentences']:

                # Round-robin over the voices so every sentence gets a different one.
                _voice = list_voices[ix % len(list_voices)]
                print(ix, lang, text)
                ix += 1

                # The last character of the sentence is replaced by ", .. !!!".
                _ssml = (
                    '<speak>'
                    '<prosody volume=\'64\'>'
                    f'<prosody rate=\'{rate}\'>'
                    f'<voice name=\'{_voice}\'>'
                    '<s>'
                    f'{text[:-1] + ", .. !!!"}'
                    '</s>'
                    '</voice>'
                    '</prosody>'
                    '</prosody>'
                    '</speak>'
                )
                # Copy of the SSML on disk; the synthesis below uses `_ssml` directly.
                with open('_tmp_ssml.txt', 'w') as f:
                    f.write(_ssml)

                args = get_args()
                args.ssml = True
                args.text = [_ssml]
                args.interactive = False

                state = CommandLineInterfaceState(args=args)
                initialize_args(state)
                initialize_tts(state)

                style_path = 'tmp1.wav'
                # Drop the file of the previous sentence so a failed synthesis
                # cannot silently re-use stale audio.
                if os.path.isfile(style_path):
                    os.remove(style_path)
                process_lines(state, wav_path=style_path)
                shutdown_tts(state)

                if not os.path.isfile(style_path):
                    print(f'No audio written for voice {_voice} - skipping sentence')
                    continue
                x, fs = audiofile.read(style_path)
                total_audio_mimic3.append(x)

        if not total_audio_mimic3:
            # np.concatenate raises on an empty list.
            print(f'Nothing synthesized for {lang} at rate {rate}')
            continue

        total_audio_mimic3 = np.concatenate(total_audio_mimic3)  # -- concat 77x lists
        # NOTE(review): 22050 Hz is assumed for every voice; the rate returned
        # by audiofile.read (`fs`) is not checked - confirm.
        audiofile.write(f'harvards_upload_mimic3_{rate}_{lang}.wav', total_audio_mimic3, 22050)

        print(total_audio_mimic3.shape, 'LEN\n')
239
+
240
+
241
+