|
|
|
import shutil |
|
import csv |
|
import io |
|
import os |
|
import typing |
|
import wave |
|
import sys |
|
from mimic3_tts.__main__ import (CommandLineInterfaceState, |
|
get_args, |
|
initialize_args, |
|
initialize_tts, |
|
|
|
|
|
shutdown_tts, |
|
OutputNaming, |
|
process_line) |
|
import time |
|
import json |
|
import os |
|
import numpy as np |
|
|
|
from pathlib import Path |
|
import audiofile |
|
|
|
|
|
|
|
# Root of the local mimic3-voices checkout; voices are expected under
# <ROOT_DIR>/voices/<lang>/<voice>/.
ROOT_DIR = '/data/dkounadis/mimic3-voices/'

# Voice identifiers of the form '<lang>/<voice>' or, when the voice ships a
# speakers.txt, '<lang>/<voice>#<speaker>' (one entry per line of that file).
foreign_voices = []
english_voices = []

for lang in os.listdir(ROOT_DIR + 'voices'):

    # Language folders whose name contains 'en_' go to the English list,
    # everything else to the foreign list.
    target_voices = english_voices if 'en_' in lang else foreign_voices

    for voice in os.listdir(ROOT_DIR + 'voices/' + lang):
        try:
            with open(ROOT_DIR + 'voices/' + lang + '/' + voice + '/speakers.txt', 'r') as f:
                for spk in f:
                    target_voices.append(lang + '/' + voice + '#' + spk.rstrip())
        except FileNotFoundError:
            # No speakers.txt: register the voice itself, without a speaker suffix.
            target_voices.append(lang + '/' + voice)

# Dump both lists so the discovered voices are visible in the log.
for voice_name in foreign_voices:
    print(voice_name)
print('\n_______________________________\n')
for voice_name in english_voices:
    print(voice_name)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def process_lines(state: CommandLineInterfaceState, wav_path=None):
    """Synthesize every text in ``state.texts`` and save the audio to ``wav_path``.

    Each line is handed to mimic3's ``process_line``. Afterwards the bytes
    found in ``state.all_audio`` are wrapped in a WAV container (sample rate,
    sample width and channel count taken from ``state``) and written to
    ``wav_path``.

    Args:
        state: mimic3 command-line state; read for ``args``, ``texts``,
            ``all_audio``, ``sample_rate_hz``, ``sample_width_bytes`` and
            ``num_channels``.
        wav_path: Destination of the WAV file. It is passed straight to
            ``open``, so the default ``None`` raises ``TypeError`` as soon
            as there is audio to write.

    Returns:
        None. The result is the file at ``wav_path`` plus progress messages
        on stdout.
    """
    args = state.args

    print(f'why waitings in the for loop LIN {state.texts=}\n')
    for line in state.texts:
        line_voice: typing.Optional[str] = None
        line_id = ""
        line = line.strip()

        if args.output_naming == OutputNaming.ID:
            # The line is a delimited row: id in the first column, text in
            # the last one and, when csv_voice is set, the voice in the second.
            with io.StringIO(line) as line_io:
                reader = csv.reader(line_io, delimiter=args.csv_delimiter)
                row = next(reader)
                line_id, line = row[0], row[-1]
                if args.csv_voice:
                    line_voice = row[1]

        process_line(line, state,
                     line_id=line_id,
                     line_voice=line_voice)

    # Fixed pause before state.all_audio is inspected. The message printed in
    # the else-branch below suggests it is there to give synthesis time to
    # finish -- NOTE(review): confirm against mimic3's threading model.
    time.sleep(4)

    if state.all_audio:
        # NOTE(review): the file is only written when stdout is a terminal and
        # the --stdout option is off; otherwise wav_path is left untouched and
        # nothing is printed.
        if sys.stdout.isatty() and (not state.args.stdout):
            with io.BytesIO() as wav_io:
                # Build the WAV container in memory first, then dump it to disk.
                with wave.open(wav_io, "wb") as wav_file_play:
                    wav_file_play.setframerate(state.sample_rate_hz)
                    wav_file_play.setsampwidth(state.sample_width_bytes)
                    wav_file_play.setnchannels(state.num_channels)
                    wav_file_play.writeframes(state.all_audio)

                with open(wav_path, 'wb') as wav_file:
                    wav_file.write(wav_io.getvalue())
                    print('\n\n5T', wav_path)
    else:
        print('\n\nDOES NOT TTSING --> ADD SOME time.sleep(4)', wav_path)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _build_ssml(text, voice, rate):
    """Return the SSML document that makes mimic3 speak ``text``.

    The last character of ``text`` is dropped and replaced by the fixed
    suffix ``", .. !!!"``; the sentence is wrapped in a ``<voice>`` element
    for ``voice`` and in ``<prosody>`` elements for ``rate`` and volume 64.
    """
    return (
        '<speak>'
        '<prosody volume=\'64\'>'
        f'<prosody rate=\'{rate}\'>'
        f'<voice name=\'{voice}\'>'
        '<s>'
        f'{text[:-1] + ", .. !!!"}'
        '</s>'
        '</voice>'
        '</prosody>'
        '</prosody>'
        '</speak>'
    )


# The sentence lists do not depend on language or rate, so load them once.
with open('harvard.json', 'r') as f:
    harvard_individual_sentences = json.load(f)['sentences']

# Only the first 4 lists are synthesized, flattened into one running sequence
# so that the sentence index keeps counting across lists.
harvard_texts = [
    text
    for list_of_10 in harvard_individual_sentences[:4]
    for text in list_of_10['sentences']
]

for lang, list_voices in [
    ['english', english_voices],
    ['foreign', foreign_voices],
]:

    if not list_voices:
        # Without voices the round-robin below would divide by zero.
        print(f'No {lang} voices available - skipping')
        continue

    for rate in [1, 4]:

        total_audio_mimic3 = []

        for ix, text in enumerate(harvard_texts):

            # Cycle through the voices: sentence ix gets voice ix modulo #voices.
            _voice = list_voices[ix % len(list_voices)]

            print(ix, lang, text)

            _ssml = _build_ssml(text, _voice, rate)

            # Keep a copy of the last SSML request on disk.
            with open('_tmp_ssml.txt', 'w') as f:
                f.write(_ssml)

            # Configure mimic3 as if it had been called from the command line
            # with the SSML document as its text argument.
            args = get_args()
            args.ssml = True
            args.text = [_ssml]
            args.interactive = False

            state = CommandLineInterfaceState(args=args)
            initialize_args(state)
            initialize_tts(state)

            # NOTE(review): the same temporary file is reused for every
            # sentence; if process_lines does not write it, the read below
            # picks up whatever an earlier sentence left there.
            style_path = 'tmp1.wav'
            process_lines(state, wav_path=style_path)
            shutdown_tts(state)

            x, fs = audiofile.read(style_path)
            total_audio_mimic3.append(x)

        total_audio_mimic3 = np.concatenate(total_audio_mimic3)

        # NOTE(review): the output rate is hard-coded to 22050 Hz instead of
        # using the rate returned by audiofile.read -- confirm all voices
        # produce 22050 Hz audio.
        audiofile.write(f'harvards_upload_mimic3_{rate}_{lang}.wav', total_audio_mimic3, 22050)

        print(total_audio_mimic3.shape, 'LEN\n')
|
|
|
|
|
|
|
|