# 1. Synthesize Harvard Sentences via Mimic-3 - 1 voice
# 1. Synthesize via StyleTTS2 --> use same or sweetdreams
# 2. Run audinterface on the 767 synthesized wavs
# 3. .mimic3_pkl .styletts2_pkl -> different durations
# The script may crash if shutil.copyfile() has not fully flushed a voice's .onnx
# (onnx then complains about an incomplete protobuf file). Simply rerun the script -
# it copies all voices from hf:mimic3-voices to ~/.local/mimic3.
import shutil
import csv
import io
import os
import typing
import wave
import sys
from mimic3_tts.__main__ import (CommandLineInterfaceState,
                                 get_args,
                                 initialize_args,
                                 initialize_tts,
                                 # print_voices,
                                 # process_lines,
                                 shutdown_tts,
                                 OutputNaming,
                                 process_line)
import msinference
import time
import json
import pandas as pd
import numpy as np
import audonnx
import audb
from pathlib import Path
import transformers
import torch
import audmodel
import audinterface
import matplotlib.pyplot as plt
import audiofile
# ================================================ LIST OF VOICES
ROOT_DIR = '/data/dkounadis/mimic3-voices/'
foreign_voices = []
english_voices = []
for lang in os.listdir(ROOT_DIR + 'voices'):
    for voice in os.listdir(ROOT_DIR + 'voices/' + lang):
        if 'en_' in lang:
            try:
                with open(ROOT_DIR + 'voices/' + lang + '/' + voice + '/speakers.txt', 'r') as f:
                    for spk in f:
                        english_voices.append(lang + '/' + voice + '#' + spk.rstrip())
            except FileNotFoundError:
                english_voices.append(lang + '/' + voice)
        else:
            try:
                with open(ROOT_DIR + 'voices/' + lang + '/' + voice + '/speakers.txt', 'r') as f:
                    for spk in f:
                        foreign_voices.append(lang + '/' + voice + '#' + spk.rstrip())
            except FileNotFoundError:
                foreign_voices.append(lang + '/' + voice)
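# The collected entries follow Mimic-3's '<lang>/<voice>' naming, with a
# '#<speaker>' suffix for multi-speaker voices (e.g. 'bn/multi_low#02194'
# as in the exclusion list further below). Quick overview:
print(f'{len(english_voices)=} {len(foreign_voices)=}')
print('sample english voices:', english_voices[:3])
print('sample foreign voices:', foreign_voices[:3])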
# ================================================== INTERFACE MODELS
LABELS = [
    'arousal', 'dominance', 'valence',
    # 'speech_synthesizer', 'synthetic_singing',
    'Angry',
    'Sad',
    'Happy',
    'Surprise',
    'Fear',
    'Disgust',
    'Contempt',
    'Neutral',
]
args = transformers.Wav2Vec2Config()  # finetuning_task='spef2feat_reg')  -- used here only as a holder for the torch devices
args.dev = torch.device('cuda:0')
args.dev2 = torch.device('cuda:0')
def _softmax(x):
    '''x : (batch, num_class)'''
    x = x - x.max(1, keepdims=True)  # shift by the row max for numerical stability (also avoids mutating the caller's array)
    x = np.maximum(-100, x)
    x = np.exp(x)
    x /= x.sum(1, keepdims=True)
    return x
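# Worked example of the stabilisation above: without subtracting the row max,
# np.exp(-400.) underflows to 0.0 for every class and the row sum becomes 0
# (division by zero); with the shift the largest entry becomes 0, so e.g.
# _softmax(np.array([[-400., -400.]])) returns [[0.5, 0.5]].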
from transformers import AutoModelForAudioClassification
import types


def _infer(self, x):
    '''x: (batch, audio-samples-16KHz)'''
    x = (x + self.config.mean) / self.config.std  # normalisation ('+' is intentional)
    x = self.ssl_model(x, attention_mask=None).last_hidden_state
    # attentive statistics pooling: attention-weighted mean and std over time
    h = self.pool_model.sap_linear(x).tanh()
    w = torch.matmul(h, self.pool_model.attention)
    w = w.softmax(1)
    mu = (x * w).sum(1)
    x = torch.cat(
        [
            mu,
            ((x * x * w).sum(1) - mu * mu).clamp(min=1e-7).sqrt()
        ], 1)
    return self.ser_model(x)


teacher_cat = AutoModelForAudioClassification.from_pretrained(
    '3loi/SER-Odyssey-Baseline-WavLM-Categorical-Attributes',
    trust_remote_code=True  # model/pooling definitions live in the 3loi/SER-.. repo
).to(args.dev2).eval()
teacher_cat.forward = types.MethodType(_infer, teacher_cat)
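# Optional smoke test of the patched forward (not part of the pipeline; assumes
# a 1 s silent clip at 16 kHz is acceptable input). It should return one logit
# per categorical emotion in LABELS[3:], i.e. a tensor of shape (1, 8):
# with torch.no_grad():
#     print(teacher_cat(torch.zeros(1, 16000).to(args.dev2)).shape)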
# Audioset & ADV
# audioset_model = audonnx.load(audmodel.load('17c240ec-1.0.0'), device='cuda:0')
adv_model = audonnx.load(audmodel.load('90398682-2.0.0'), device='cuda:0')


def process_function(x, sampling_rate, idx):
    '''Run the ONNX A/D/V teacher and the categorical teacher on one window.

    With the audioset model commented out the returned row is
    [arousal, dominance, valence, 8 softmaxed categories] = 11 values,
    matching LABELS.
    '''
    # x = x[None, :]  ASaHSuFDCN
    # {0: 'Angry', 1: 'Sad', 2: 'Happy', 3: 'Surprise',
    #  4: 'Fear', 5: 'Disgust', 6: 'Contempt', 7: 'Neutral'}
    # e.g. tensor([[0.0015, 0.3651, 0.0593, 0.0315, 0.0600, 0.0125, 0.0319, 0.4382]])
    logits_cat = teacher_cat(torch.from_numpy(x).to(args.dev)).cpu().detach().numpy()
    # USE ALL CATEGORIES
    # --
    # logits_audioset = audioset_model(x, 16000)['logits_sounds']
    # logits_audioset = logits_audioset[:, [7, 35]]  # speech synthesizer, synthetic singing
    # --
    logits_adv = adv_model(x, 16000)['logits']
    cat = np.concatenate([logits_adv,
                          # _sigmoid(logits_audioset),
                          _softmax(logits_cat)],
                         1)
    print(cat)
    return cat  # (1, 11)
interface = audinterface.Feature(
    feature_names=LABELS,
    process_func=process_function,
    # process_func_args={'outputs': 'logits_scene'},
    process_func_applies_sliding_window=False,
    win_dur=4.0,
    hop_dur=1.0,
    sampling_rate=16000,
    resample=True,
    verbose=True,
)
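# Hedged smoke test (commented out, not part of the pipeline): audinterface
# applies the 4 s window / 1 s hop itself, so a short silent signal should
# yield one 11-column row per window:
# sig = np.zeros(16000 * 6, dtype=np.float32)  # 6 s of silence (assumption)
# df_check = interface.process_signal(sig, 16000)
# print(df_check.shape)  # expected: (n_windows, 11)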
# ======================================== END INTERFACE
# Filter out voices whose reference prompt has insufficient duration
foreign_voices = [i for i in foreign_voices if i not in ['bn/multi_low#02194',
                                                         'uk_UK/m-ailabs_low#obruchov',
                                                         'uk_UK/m-ailabs_low#shepel',
                                                         'uk_UK/m-ailabs_low#loboda',
                                                         'uk_UK/m-ailabs_low#miskun',
                                                         'uk_UK/m-ailabs_low#sumska',
                                                         'uk_UK/m-ailabs_low#pysariev',
                                                         ]]
# print(english_voices, '\n_________________________\n', foreign_voices)
# ----------------------
# print(foreign_voices, len(foreign_voices))
# raise SystemExit
def process_lines(state: CommandLineInterfaceState, wav_path=None):
    '''Adapted from Mimic-3's internal process_lines(): synthesizes state.texts
    and writes the combined audio to wav_path.'''
    args = state.args
    result_idx = 0
    print(f'process_lines(): {state.texts=}\n')
    for line in state.texts:
        # print(f'LIN {line=}\n')  # prints an empty line, i.e. it does not show the predefined text of state.texts
        line_voice: typing.Optional[str] = None
        line_id = ""
        line = line.strip()
        # if not line:
        #     continue
        if args.output_naming == OutputNaming.ID:
            # Line has the format id|text instead of just text
            with io.StringIO(line) as line_io:
                reader = csv.reader(line_io, delimiter=args.csv_delimiter)
                row = next(reader)
                line_id, line = row[0], row[-1]
                if args.csv_voice:
                    line_voice = row[1]
        process_line(line, state, line_id=line_id, line_voice=line_voice)
        result_idx += 1
        time.sleep(4)  # give Mimic-3 time to fill state.all_audio
    # Write combined audio to wav_path
    if state.all_audio:
        # _LOGGER.debug("Writing WAV audio to stdout")
        if sys.stdout.isatty() and (not state.args.stdout):
            with io.BytesIO() as wav_io:
                wav_file_play: wave.Wave_write = wave.open(wav_io, "wb")
                with wav_file_play:
                    wav_file_play.setframerate(state.sample_rate_hz)
                    wav_file_play.setsampwidth(state.sample_width_bytes)
                    wav_file_play.setnchannels(state.num_channels)
                    wav_file_play.writeframes(state.all_audio)
                    # play_wav_bytes(state.args, wav_io.getvalue())
                # wav_path = '_direct_call_2.wav'
                with open(wav_path, 'wb') as wav_file:
                    wav_file.write(wav_io.getvalue())
                print('\n\nWrote', wav_path)
    else:
        print('\n\nNothing was synthesized --> consider a longer time.sleep()', wav_path)
# -----------------------------------------------------------------------------
# cat _tmp_ssml.txt | mimic3 --cuda --ssml --noise-w 0.90001 --length-scale 0.91 --noise-scale 0.04 > noise_w=0.90_en_happy_2.wav
# ======================================================================
# END DEF
# https://huggingface.co/dkounadis/artificial-styletts2/tree/main/mimic3_foreign
# STYLES Already Made - HF
english_dir = 'english_pkl/'
foreign_dir = 'foreign_pkl/'
Path(english_dir).mkdir(parents=True, exist_ok=True)
Path(foreign_dir).mkdir(parents=True, exist_ok=True)
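# Illustrative layout (assumption, derived from the _str naming used below),
# e.g. for a voice 'en_US/vctk_low#p239':
#   english_pkl/mimic3__en_US_vctk_p239.wav     english_pkl/mimic3__en_US_vctk_p239.pkl
#   english_pkl/styletts2__en_US_vctk_p239.wav  english_pkl/styletts2__en_US_vctk_p239.pkl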
# For every voice: load harvard.json -> load its style prompt -> synth with Mimic-3 -> synth with StyleTTS2 -> run audinterface on both wavs
for folder, list_voices in [
        ['foreign', foreign_voices],
        ['english', english_voices],
        ]:
    # NOTE: only the first 4 voices of each list are processed here
    print(folder, list_voices[:4], '\n')
    for _id, _voice in enumerate(list_voices[:4]):
        _str = _voice.replace('/', '_').replace('#', '_').replace('_low', '')
        _dir = folder + '_pkl/'
        if 'cmu-arctic' in _str:
            _str = _str.replace('cmu-arctic', 'cmu_arctic')  # + '.wav'
        print('\n\n\n\nExecuting', _voice, '\n\n\n\n\n')
        if (
            not os.path.isfile(_dir + 'mimic3__' + _str + '.wav') or
            not os.path.isfile(_dir + 'styletts2__' + _str + '.wav')
        ):
            # The Mimic3 repo https://github.com/MycroftAI/mimic3-voices can
            # exceed its LFS download quota, so the voices are copied from a
            # local clone of
            # https://huggingface.co/mukowaty/mimic3-voices/tree/main/voices
            # into ~/.local/share/mycroft/mimic3/voices/
            home_voice_dir = f'/home/audeering.local/dkounadis/.local/share/mycroft/mimic3/voices/{_voice.split("#")[0]}/'
            Path(home_voice_dir).mkdir(parents=True, exist_ok=True)
            speaker_free_voice_name = _voice.split("#")[0] if '#' in _voice else _voice
            if (
                (not os.path.isfile(home_voice_dir + 'generator.onnx')) or
                (os.path.getsize(home_voice_dir + 'generator.onnx') < 500)  # a tiny generator.onnx is just an LFS pointer file
            ):
                # Copy
                shutil.copyfile(
                    f'/data/dkounadis/mimic3-voices/voices/{speaker_free_voice_name}/generator.onnx',
                    home_voice_dir + 'generator.onnx')
            # pre-made style prompt for StyleTTS2
            prompt_path = f'mimic3_{folder}_4x/' + _str + '.wav'
            # ACTUAL TTS
            with open('harvard.json', 'r') as f:
                harvard_individual_sentences = json.load(f)['sentences']
            total_audio_mimic3 = []
            total_audio_stts2 = []
            ix = 0
            for list_of_10 in harvard_individual_sentences[:1]:  # the full list has 77 groups of 10 sentences
                text = ' '.join(list_of_10['sentences'])
                # harvard.append(long_sentence.replace('.', ' '))
                # for text in list_of_10['sentences']:
                style_vec = msinference.compute_style(prompt_path)
                print(ix, text)
                ix += 1
                x = msinference.inference(text,
                                          style_vec,
                                          alpha=0.3,
                                          beta=0.7,
                                          diffusion_steps=7,
                                          embedding_scale=1)
                total_audio_stts2.append(x)
                # also synthesize the same sentence and voice with Mimic-3
                # MIMIC-3 = = = = = = = = = = = = = = BEGIN
                rate = 1  # high speed sounds nice if used as speaker-reference audio for StyleTTS2
                _ssml = (
                    '<speak>'
                    '<prosody volume=\'64\'>'
                    f'<prosody rate=\'{rate}\'>'
                    f'<voice name=\'{_voice}\'>'
                    '<s>'
                    f'{text}'
                    '</s>'
                    '</voice>'
                    '</prosody>'
                    '</prosody>'
                    '</speak>'
                )
                with open('_tmp_ssml.txt', 'w') as f:
                    f.write(_ssml)
                # ps = subprocess.Popen(f'cat _tmp_ssml.txt | mimic3 --ssml > {reference_wav}', shell=True)
                # ps.wait()  # mimic3 used to be called via subprocess because samples don't have time to be written to the stdout buffer
                # NOTE: use a separate name (args_m3) so the global `args` holding the torch devices is not clobbered
                args_m3 = get_args()
                args_m3.ssml = True
                args_m3.text = [_ssml]  # ['aa', 'bb']  # txt
                args_m3.interactive = False
                # args_m3.output_naming = OutputNaming.TIME
                state = CommandLineInterfaceState(args=args_m3)
                initialize_args(state)
                initialize_tts(state)
                # args.texts = [txt]  # ['aa', 'bb']  # txt
                # state.stdout = '.'  # None  # 'makeme.wav'
                # state.output_dir = '.noopy'
                # state.interactive = False
                # state.output_naming = OutputNaming.TIME
                # state.ssml = 1234546575
                # state.stdout = True
                # state.tts = True
                process_lines(state, wav_path='tmp1.wav')
                shutdown_tts(state)
                x, fs = audiofile.read('tmp1.wav')
                total_audio_mimic3.append(x)
                print(fs, text, 'mimic3')
                # MIMIC3 = = = = = = = = = = = = = = END
            total_audio_stts2 = np.concatenate(total_audio_stts2)  # concat the Harvard groups
            audiofile.write(_dir + 'styletts2__' + _str + '.wav', total_audio_stts2, 24000)
            total_audio_mimic3 = np.concatenate(total_audio_mimic3)  # concat the Harvard groups
            audiofile.write(_dir + 'mimic3__' + _str + '.wav', total_audio_mimic3, 22050)
            print('Saving:', _dir + 'mimic3__' + _str + '.wav')
        else:
            print('Skip:', _dir + 'styletts2__' + _str + '.wav')
        # AUD I N T E R F A C E
        # file_interface = f'timeseries_{long_audio.replace("/", "")}.pkl'
        for engine in ['mimic3', 'styletts2']:
            harvard_of_voice = f'{_dir}{engine}__{_str}'
            if not os.path.exists(harvard_of_voice + '.pkl'):
                df_pred = interface.process_file(harvard_of_voice + '.wav')
                df_pred.to_pickle(harvard_of_voice + '.pkl')
            else:
                print(harvard_of_voice + '.pkl', 'FOUND')
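# ======================================== COMPARE PKL (step 3 of the header)
# Hedged sketch (commented out): once both <engine>__<voice>.pkl files exist,
# the two engines can be compared per voice. The DataFrames have a different
# number of rows because the synthesized durations differ between engines.
# The names _dir/_str below refer to the last processed voice (assumption).
#
# df_m = pd.read_pickle(_dir + 'mimic3__' + _str + '.pkl')
# df_s = pd.read_pickle(_dir + 'styletts2__' + _str + '.pkl')
# fig, ax = plt.subplots()
# ax.plot(df_m['arousal'].to_numpy(), label='mimic3')
# ax.plot(df_s['arousal'].to_numpy(), label='styletts2')
# ax.set_xlabel('window index (4 s window, 1 s hop)')
# ax.set_ylabel('arousal')
# ax.legend()
# fig.savefig('arousal_comparison.png')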