import shutil |
import csv |
import io |
import os |
import typing |
import wave |
import sys |
from mimic3_tts.__main__ import (CommandLineInterfaceState, |
get_args, |
initialize_args, |
initialize_tts, |
shutdown_tts, |
OutputNaming, |
process_line) |
import time |
import json |
import pandas as pd |
import os |
import numpy as np |
import audonnx |
import audb |
from pathlib import Path |
import transformers |
import torch |
import audmodel |
import audinterface |
import matplotlib.pyplot as plt |
import audiofile |
ROOT_DIR = '/data/dkounadis/mimic3-voices/'


def _voice_variants(lang, voice):
    """Return the voice identifiers found for one ``<lang>/<voice>`` folder.

    If the folder holds a ``speakers.txt``, one identifier
    ``<lang>/<voice>#<speaker>`` is returned per non-blank line of that file;
    otherwise the single identifier ``<lang>/<voice>`` is returned.
    """
    voice_name = lang + '/' + voice
    speakers_file = ROOT_DIR + 'voices/' + voice_name + '/speakers.txt'
    try:
        with open(speakers_file, 'r') as f:
            # Blank lines (e.g. a trailing newline) are skipped, otherwise they
            # would produce a speaker-less 'lang/voice#' identifier.
            return [voice_name + '#' + spk.rstrip() for spk in f if spk.strip()]
    except FileNotFoundError:
        # No speakers.txt: the voice is addressed by its plain name.
        return [voice_name]


# Voices are split by language folder: folders containing 'en_' are English,
# everything else is foreign.
# NOTE(review): os.listdir order is arbitrary, so the `[:4]` subsets taken
# further down the script are not guaranteed to be stable across machines.
foreign_voices = []
english_voices = []
for lang in os.listdir(ROOT_DIR + 'voices'):
    _bucket = english_voices if 'en_' in lang else foreign_voices
    for voice in os.listdir(ROOT_DIR + 'voices/' + lang):
        _bucket.extend(_voice_variants(lang, voice))
# Column names of the feature table produced by `interface`.
# The order follows the concatenation done in `process_function`:
# the adv model logits first, then the softmax of the categorical teacher.
_DIMENSIONAL_LABELS = ['arousal', 'dominance', 'valence']
_CATEGORICAL_LABELS = [
    'Angry',
    'Sad',
    'Happy',
    'Surprise',
    'Fear',
    'Disgust',
    'Contempt',
    'Neutral',
]
LABELS = _DIMENSIONAL_LABELS + _CATEGORICAL_LABELS
# NOTE(review): in the visible code this config object is only used to carry
# the two device handles below; both point at the first CUDA device.
config = transformers.Wav2Vec2Config()
config.dev = config.dev2 = torch.device('cuda:0')
def _softmax(x):
    """Return the row-wise softmax of ``x``.

    x : (batch, num_class) array of logits. The input array is left
    untouched; a new array of the same shape is returned whose rows sum to 1.
    """
    # Subtract the row maximum for numerical stability. This is done
    # out-of-place so the caller's array is not modified.
    shifted = x - x.max(1, keepdims=True)
    # Clamp very negative values before exponentiating.
    shifted = np.maximum(-100, shifted)
    exp = np.exp(shifted)
    return exp / exp.sum(1, keepdims=True)
from transformers import AutoModelForAudioClassification |
import types |
def _infer(self, x):
    """Replacement forward pass bound onto the categorical teacher model.

    x: (batch, audio-samples-16KHz)

    Normalises the waveform with the statistics stored in ``self.config``,
    runs ``self.ssl_model``, pools its ``last_hidden_state`` along axis 1
    into a softmax-weighted mean and standard deviation, and feeds the
    concatenation of both to ``self.ser_model``.
    """
    # NOTE(review): the mean is *added*, not subtracted; kept exactly as in
    # the original - confirm against the checkpoint's own preprocessing.
    normalised = (x + self.config.mean) / self.config.std
    hidden = self.ssl_model(normalised, attention_mask=None).last_hidden_state

    # Attention scores per position, normalised along axis 1.
    projected = self.pool_model.sap_linear(hidden).tanh()
    weights = torch.matmul(projected, self.pool_model.attention).softmax(1)

    # Weighted first and second moments along axis 1.
    mean = (hidden * weights).sum(1)
    second_moment = (hidden * hidden * weights).sum(1)
    # Clamp keeps the variance strictly positive before the square root.
    std = (second_moment - mean * mean).clamp(min=1e-7).sqrt()

    pooled = torch.cat([mean, std], 1)
    return self.ser_model(pooled)
# Categorical emotion teacher.
# NOTE: trust_remote_code=True executes Python code shipped with the
# checkpoint repository; only use it with a source you trust.
teacher_cat = AutoModelForAudioClassification.from_pretrained(
    '3loi/SER-Odyssey-Baseline-WavLM-Categorical-Attributes',
    trust_remote_code=True,
)
teacher_cat = teacher_cat.to(config.dev2)
teacher_cat = teacher_cat.eval()
# Swap the model's forward for the simplified `_infer` defined above.
teacher_cat.forward = types.MethodType(_infer, teacher_cat)

# Second teacher, loaded through audmodel/audonnx onto the first CUDA device;
# `process_function` reads its 'logits' output.
_adv_model_root = audmodel.load('90398682-2.0.0')
adv_model = audonnx.load(_adv_model_root, device='cuda:0')
def process_function(x, sampling_rate, idx):
    """Compute the emotion features of one audio window.

    Runs both teachers on ``x`` and returns, concatenated along axis 1, the
    ``'logits'`` output of ``adv_model`` followed by the softmax of the
    ``teacher_cat`` logits. The columns are expected to line up with
    ``LABELS`` (3 dimensional + 8 categorical values = 11).

    x: numpy signal handed over by ``interface``
       (presumably (channels, samples) - TODO confirm).
    sampling_rate: unused; both teachers are fed as 16 kHz audio, which is
       what ``interface`` resamples to.
    idx: unused.
    """
    # Inference only: disable autograd so no graph or intermediate
    # activations are kept on the GPU for every window.
    with torch.no_grad():
        logits_cat = teacher_cat(torch.from_numpy(x).to(config.dev)).cpu().detach().numpy()
    logits_adv = adv_model(x, 16000)['logits']
    cat = np.concatenate([logits_adv, _softmax(logits_cat)], 1)
    print(cat)
    return cat
# Feature extractor that slides a window over a file and calls
# `process_function` on every window; one output column per entry of LABELS.
_feature_options = dict(
    feature_names=LABELS,
    process_func=process_function,
    # The windowing is done by audinterface, not by process_function.
    process_func_applies_sliding_window=False,
    win_dur=4.0,  # window length (seconds by audinterface default - confirm)
    hop_dur=1.0,  # hop between windows (same unit as win_dur)
    sampling_rate=16000,
    resample=True,  # convert input audio to `sampling_rate` when needed
    verbose=True,
)
interface = audinterface.Feature(**_feature_options)
# Foreign voices dropped from the evaluation (the reason is not recorded in
# this file).
_EXCLUDED_FOREIGN_VOICES = {
    'bn/multi_low#02194',
    'uk_UK/m-ailabs_low#obruchov',
    'uk_UK/m-ailabs_low#shepel',
    'uk_UK/m-ailabs_low#loboda',
    'uk_UK/m-ailabs_low#miskun',
    'uk_UK/m-ailabs_low#sumska',
    'uk_UK/m-ailabs_low#pysariev',
}
foreign_voices = [
    name for name in foreign_voices if name not in _EXCLUDED_FOREIGN_VOICES
]
def process_lines(state: CommandLineInterfaceState, wav_path=None):
    """Synthesise every entry of ``state.texts`` and save the audio as WAV.

    Described by the original author as a mimic3 internal call. Each text is
    passed to ``process_line``; afterwards the audio accumulated in
    ``state.all_audio`` is wrapped in a WAV container (rate, sample width and
    channel count taken from ``state``) and written to ``wav_path``.

    state: mimic3 command-line state carrying ``args``, ``texts`` and the
        synthesis results.
    wav_path: destination file of the WAV audio. It must be given whenever
        audio is written; ``open(None)`` fails otherwise.

    NOTE(review): the nesting below was reconstructed from a source whose
    indentation was lost - the final ``else`` is taken to belong to
    ``if state.all_audio`` (its message refers to missing audio), and the
    sleep is kept inside the loop. Please confirm both.
    """
    args = state.args
    print(f'why waitings in the for loop LIN {state.texts=}\n')
    for line in state.texts:
        line_voice: typing.Optional[str] = None
        line_id = ""
        line = line.strip()
        if args.output_naming == OutputNaming.ID:
            # The line is a delimited record: first field is the id, last
            # field the text, and (with csv_voice) the second field the voice.
            with io.StringIO(line) as line_io:
                reader = csv.reader(line_io, delimiter=args.csv_delimiter)
                row = next(reader)
                line_id, line = row[0], row[-1]
                if args.csv_voice:
                    line_voice = row[1]
        process_line(line, state, line_id=line_id, line_voice=line_voice)
        # Pause after submitting the line; presumably this gives the
        # synthesis backend time to fill state.all_audio - TODO confirm.
        time.sleep(4)

    if state.all_audio:
        # NOTE(review): audio is only saved when stdout is a terminal and
        # --stdout is not set; in a non-interactive run (nohup, pipe, batch
        # job) nothing is written and no message is printed.
        if sys.stdout.isatty() and (not state.args.stdout):
            with io.BytesIO() as wav_io:
                # Build the complete WAV file in memory first.
                with wave.open(wav_io, "wb") as wav_file_play:
                    wav_file_play.setframerate(state.sample_rate_hz)
                    wav_file_play.setsampwidth(state.sample_width_bytes)
                    wav_file_play.setnchannels(state.num_channels)
                    wav_file_play.writeframes(state.all_audio)
                with open(wav_path, 'wb') as wav_file:
                    wav_file.write(wav_io.getvalue())
            print('\n\n5T', wav_path)
    else:
        print('\n\nDOES NOT TTSING --> ADD SOME time.sleep(4)', wav_path)
# Output folders for the synthesised audio and the extracted feature tables.
english_dir = 'english_pkl/'
foreign_dir = 'foreign_pkl/'
Path(english_dir).mkdir(parents=True, exist_ok=True)
Path(foreign_dir).mkdir(parents=True, exist_ok=True)

# For the first 4 voices of each group: synthesise the first entry of
# harvard.json with mimic3 and with StyleTTS2 (styled by a mimic3 prompt),
# then extract the emotion features of both recordings.
# NOTE(review): the nesting was reconstructed from a source whose indentation
# was lost - please confirm the block structure.
for folder, list_voices in [
        ['foreign', foreign_voices],
        ['english', english_voices],
]:
    print(folder, list_voices[:4], '\n\nEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE')
    for _voice in list_voices[:4]:
        # File-system friendly voice name.
        _str = _voice.replace('/', '_').replace('#', '_').replace('_low', '')
        _str = _str.replace('cmu-arctic', 'cmu_arctic')
        _dir = folder + '_pkl/'
        mimic3_wav = _dir + 'mimic3__' + _str + '.wav'
        styletts2_wav = _dir + 'styletts2__' + _str + '.wav'
        print('\n\n\n\nExecuting', _voice, '\n\n\n\n\n')

        if not (os.path.isfile(mimic3_wav) and os.path.isfile(styletts2_wav)):
            # Voice folder without the '#speaker' suffix.
            speaker_free_voice_name = _voice.split('#')[0]

            # Make sure mimic3 finds the voice model in its home directory.
            home_voice_dir = f'/home/audeering.local/dkounadis/.local/share/mycroft/mimic3/voices/{speaker_free_voice_name}/'
            Path(home_voice_dir).mkdir(parents=True, exist_ok=True)
            home_generator = home_voice_dir + 'generator.onnx'
            # A missing or tiny (< 500 bytes) file is replaced by the real
            # model; presumably tiny files are placeholders - TODO confirm.
            if (
                (not os.path.isfile(home_generator)) or
                (os.path.getsize(home_generator) < 500)
            ):
                shutil.copyfile(
                    ROOT_DIR + f'voices/{speaker_free_voice_name}/generator.onnx',
                    home_generator)

            # Reference recording used to derive the StyleTTS2 style vector.
            prompt_path = f'mimic3_{folder}_4x/' + _str + '.wav'

            with open('harvard.json', 'r') as f:
                harvard_individual_sentences = json.load(f)['sentences']

            total_audio_mimic3 = []
            total_audio_stts2 = []
            for ix, list_of_10 in enumerate(harvard_individual_sentences[:1]):
                text = ' '.join(list_of_10['sentences'])

                # --- StyleTTS2 ---
                # NOTE(review): `msinference` is not imported in the visible
                # part of this file (see the reminder printed at the end of
                # the script); this branch raises NameError until it is.
                style_vec = msinference.compute_style(prompt_path)
                print(ix, text)
                x = msinference.inference(text,
                                          style_vec,
                                          alpha=0.3,
                                          beta=0.7,
                                          diffusion_steps=7,
                                          embedding_scale=1)
                total_audio_stts2.append(x)

                # --- mimic3 ---
                rate = 1
                _ssml = (
                    "<speak>"
                    "<prosody volume='64'>"
                    f"<prosody rate='{rate}'>"
                    f"<voice name='{_voice}'>"
                    "<s>"
                    f"{text}"
                    "</s>"
                    "</voice>"
                    "</prosody>"
                    "</prosody>"
                    "</speak>"
                )
                with open('_tmp_ssml.txt', 'w') as f:
                    f.write(_ssml)

                args = get_args()
                args.ssml = True
                args.text = [_ssml]
                args.interactive = False
                state = CommandLineInterfaceState(args=args)
                initialize_args(state)
                initialize_tts(state)
                # Drop the previous result first: process_lines can finish
                # without writing, and re-reading an old tmp1.wav would
                # store another voice's audio under this voice's name.
                if os.path.exists('tmp1.wav'):
                    os.remove('tmp1.wav')
                process_lines(state, wav_path='tmp1.wav')
                shutdown_tts(state)
                x, fs = audiofile.read('tmp1.wav')
                total_audio_mimic3.append(x)
                print(fs, text, 'mimic3')

            # The StyleTTS2 audio is written at 24000 Hz, mimic3 at 22050 Hz.
            total_audio_stts2 = np.concatenate(total_audio_stts2)
            audiofile.write(styletts2_wav, total_audio_stts2, 24000)
            total_audio_mimic3 = np.concatenate(total_audio_mimic3)
            audiofile.write(mimic3_wav, total_audio_mimic3, 22050)
            print('Saving:', mimic3_wav)
        else:
            print('Skip:', styletts2_wav)

        # Feature extraction, cached as one pickle per engine and voice.
        for engine in ['mimic3', 'styletts2']:
            harvard_of_voice = f'{_dir}{engine}__{_str}'
            if not os.path.exists(harvard_of_voice + '.pkl'):
                df = interface.process_file(harvard_of_voice + '.wav')
                df.to_pickle(harvard_of_voice + '.pkl')
            else:
                print(harvard_of_voice + '.pkl', 'FOUND')
def _plot_dimension(axis, time_stamp, mimic3_values, styletts2_values, dim):
    """Draw one emotion dimension on ``axis``.

    The mimic3 curve is plotted as a line and the gap to the StyleTTS2 curve
    is shaded. The y-axis is labelled with the lower-cased ``dim`` and fixed
    to (0, 1); the x tick labels are blanked.
    """
    axis.plot(time_stamp, mimic3_values,
              color=(0, 104/255, 139/255),
              label='mean_1',
              linewidth=2)
    axis.fill_between(time_stamp,
                      mimic3_values,
                      styletts2_values,
                      color=(.2, .2, .2),
                      alpha=0.244)
    axis.set_ylabel(dim.lower(), color=(.4, .4, .4), fontsize=14)
    axis.set_ylim([1e-7, .9999])
    axis.set_xlim([time_stamp[0], time_stamp[-1]])
    axis.set_xticklabels(['' for _ in axis.get_xticklabels()])
    axis.grid()


# One figure per voice comparing the mimic3 and StyleTTS2 feature tracks.
# NOTE(review): the nesting was reconstructed from a source whose indentation
# was lost - please confirm the block structure.
for folder, list_voices in [
        ['foreign', foreign_voices],
        ['english', english_voices],
]:
    print(folder, list_voices[:4], '\n\nVISUALIZING VOICES')
    for _voice in list_voices[:4]:
        _str = _voice.replace('/', '_').replace('#', '_').replace('_low', '')
        _str = _str.replace('cmu-arctic', 'cmu_arctic')
        _dir = folder + '_pkl/'

        # Load (or compute and cache) the feature table of each engine.
        vis_df = {}
        for engine in ['mimic3', 'styletts2']:
            harvard_of_voice = f'{_dir}{engine}__{_str}'
            if not os.path.exists(harvard_of_voice + '.pkl'):
                df = interface.process_file(harvard_of_voice + '.wav')
                df.to_pickle(harvard_of_voice + '.pkl')
            else:
                df = pd.read_pickle(harvard_of_voice + '.pkl')
                print(harvard_of_voice + '.pkl', 'FOUND')
            vis_df[engine] = df

        # Trim both tables to the shorter one and index them by the window
        # end time in seconds. A new frame is built so the cached table is
        # not modified in place.
        SHORT = min(len(vis_df['mimic3']), len(vis_df['styletts2']))
        for k, v in vis_df.items():
            p = (
                v.iloc[:SHORT]
                .reset_index()
                .drop(columns=['file', 'start'])
                .set_index('end')
            )
            p.index = p.index.map(mapper=(lambda x: x.total_seconds()))
            vis_df[k] = p
        print(vis_df, '\n\n\n\n \n')

        # 10 rows: 3 dimensional + 7 categorical tracks; only the left
        # column of the grid is drawn on.
        fig, ax = plt.subplots(nrows=10, ncols=2, figsize=(24, 24),
                               gridspec_kw={'hspace': 0, 'wspace': .04})

        # Rows 0-2 use the mimic3 time axis.
        time_stamp = vis_df['mimic3'].index.to_numpy()
        for j, dim in enumerate(['arousal',
                                 'dominance',
                                 'valence']):
            _plot_dimension(ax[j, 0], time_stamp,
                            vis_df['mimic3'][dim], vis_df['styletts2'][dim],
                            dim)
            if j == 0:
                ax[j, 0].legend(['StyleTTS2 style mimic3',
                                 'StyleTTS2 style crema-d'],
                                prop={'size': 10},
                                )

        # Rows 3-9 use the StyleTTS2 time axis.
        time_stamp = vis_df['styletts2'].index.to_numpy()
        for j, dim in enumerate(['Angry',
                                 'Sad',
                                 'Happy',
                                 'Surprise',
                                 'Fear',
                                 'Disgust',
                                 'Contempt',
                                 ], start=3):
            _plot_dimension(ax[j, 0], time_stamp,
                            vis_df['mimic3'][dim], vis_df['styletts2'][dim],
                            dim)
            ax[j, 0].set_xlabel('767 Harvard Sentences (seconds)', fontsize=16, color=(.4, .4, .4))

        fig.savefig(f'bh_{_str}.png', bbox_inches='tight')
        plt.close(fig)
print('UNCOMMENT msinfereence')