# xVASynth-TTS — app.py (Hugging Face Space by Pendrokar)
# Commit 79e67e9: "robotic hf voice model"
import os
import sys
import requests
import json
from huggingface_hub import HfApi
# start xVASynth service (no HTTP)
import resources.app.no_server as xvaserver
from gr_client import BlocksDemo
# NVIDIA NeMo models
hf_model_name = "Pendrokar/xvapitch_nvidia"
model_repo = HfApi()

# Resolve the local HF-cache snapshot directory of each voice-model repo's
# latest commit; xVASynth loads model files straight from these paths.
commits = model_repo.list_repo_commits(repo_id=hf_model_name)
latest_commit_sha = commits[0].commit_id
hf_cache_models_path = f'/home/user/.cache/huggingface/hub/models--Pendrokar--xvapitch_nvidia/snapshots/{latest_commit_sha}/'
models_path = hf_cache_models_path

# Lojban model
commits = model_repo.list_repo_commits(repo_id='Pendrokar/xvasynth_lojban')
latest_commit_sha = commits[0].commit_id
hf_cache_lojban_models_path = f'/home/user/.cache/huggingface/hub/models--Pendrokar--xvasynth_lojban/snapshots/{latest_commit_sha}/'

# Robotic model — best-effort: the repo needs HF_TOKEN, and the app must
# still start when it is missing or the request fails.
hf_cache_robotic_models_path = ''
try:
    commits = model_repo.list_repo_commits(repo_id='Pendrokar/xvasynth_cabal', token=os.getenv('HF_TOKEN'))
    latest_commit_sha = commits[0].commit_id
    hf_cache_robotic_models_path = f'/home/user/.cache/huggingface/hub/models--Pendrokar--xvasynth_cabal/snapshots/{latest_commit_sha}/'
except Exception as err:  # was a bare except:; keep best-effort but surface the cause
    print(f'Robotic voice not loaded! ({err})')

# State of the currently loaded voice model (mutated by load_model below)
current_voice_model = None
current_voice_type = None
base_speaker_emb = ''
def load_model(voice_model_name: str) -> str:
    """Load a voice model into the in-process xVASynth service.

    Resolves the on-disk path and model type for *voice_model_name*
    ('x_selpahi' is the Lojban FastPitch 1.1 voice, 'cnc_cabal' the robotic
    voice, everything else an NVIDIA xVAPitch voice), asks xvaserver to load
    it, and reads the default speaker embedding from the model's sidecar
    ``.json`` file. Updates the module-level globals ``current_voice_model``,
    ``current_voice_type`` and ``base_speaker_emb``.

    Returns:
        The base speaker embedding. NOTE: on failure the previous (possibly
        stale or empty) embedding is returned unchanged.
    """
    global current_voice_model, current_voice_type, base_speaker_emb

    if voice_model_name == 'x_selpahi':
        # Lojban voice ships as a FastPitch 1.1 model in its own cache dir
        model_path = hf_cache_lojban_models_path + voice_model_name
        model_type = 'FastPitch1.1'
    else:
        if voice_model_name == 'cnc_cabal':
            model_path = hf_cache_robotic_models_path + voice_model_name
        else:
            model_path = models_path + voice_model_name
        model_type = 'xVAPitch'

    language = 'en'  # seems to have no effect if generated text is from a different language

    data = {
        'outputs': None,
        'version': '3.0',
        'model': model_path,
        'modelType': model_type,
        'base_lang': language,
        'pluginsContext': '{}',
    }

    print('Loading voice model...')
    try:
        xvaserver.loadModel(data)  # return value intentionally ignored
        current_voice_model = voice_model_name
        current_voice_type = model_type

        # The sidecar .json carries the default speaker embedding; the key
        # differs between model generations.
        with open(model_path + '.json', 'r', encoding='utf-8') as f:
            voice_model_json = json.load(f)
        if model_type == 'xVAPitch':
            base_speaker_emb = voice_model_json['games'][0]['base_speaker_emb']
        elif model_type == 'FastPitch1.1':
            base_speaker_emb = voice_model_json['games'][0]['resemblyzer']
    except requests.exceptions.RequestException as err:
        # NOTE(review): xvaserver is called in-process (no HTTP), so it is
        # unclear RequestException can actually be raised here — confirm;
        # OSError/KeyError from the .json read would currently propagate.
        print(f'FAILED to load voice model: {err}')

    return base_speaker_emb
class LocalBlocksDemo(BlocksDemo):
    """Gradio Blocks demo that synthesizes speech through the in-process
    xVASynth service (``xvaserver``) instead of an HTTP endpoint."""

    def predict(
        self,
        input_text,
        voice,
        lang,
        pacing,
        pitch,
        energy,
        anger,
        happy,
        sad,
        surprise,
        use_deepmoji
    ):
        """Synthesize ``input_text`` with the requested voice and emotions.

        Returns a list for the Gradio outputs:
        ``[wav_path, arpabet_html, em_angry, em_happy, em_sad, em_surprise,
        raw_json]``. ``wav_path`` is '' when synthesis failed. ``pitch`` and
        ``energy`` are accepted for interface compatibility but are not
        forwarded to the local backend.
        """
        global current_voice_model, current_voice_type, base_speaker_emb

        # grab only the first 1000 characters
        input_text = input_text[:1000]

        # load voice model if not the current model
        if current_voice_model != voice:
            load_model(voice)
        model_type = current_voice_type

        pace = pacing if pacing else 1.0
        save_path = '/tmp/xvapitch_audio_sample.wav'
        language = lang
        use_sr = 0
        use_cleanup = 0

        # DeepMoji/Mantella emotion sliders; negative values clamp to 0
        pluginsContext = {}
        pluginsContext["mantella_settings"] = {
            "emAngry": (anger if anger > 0 else 0),
            "emHappy": (happy if happy > 0 else 0),
            "emSad": (sad if sad > 0 else 0),
            "emSurprise": (surprise if surprise > 0 else 0),
            "run_model": use_deepmoji
        }

        data = {
            'pluginsContext': json.dumps(pluginsContext),
            'modelType': model_type,
            # pad with whitespaces as a workaround to avoid cutoffs
            'sequence': input_text.center(len(input_text) + 2, ' '),
            'pace': pace,
            'outfile': save_path,
            'vocoder': 'n/a',
            'base_lang': language,
            'base_emb': base_speaker_emb,
            'useSR': use_sr,
            'useCleanup': use_cleanup,
        }

        print('Synthesizing...')
        try:
            json_data = xvaserver.synthesize(data)
        except requests.exceptions.RequestException as err:
            # FIX: message was missing the f-prefix and printed '{err}' literally
            print(f'FAILED to synthesize: {err}')
            save_path = ''
            # FIX: fallback payload now matches how it is consumed below —
            # 'arpabet' is a '|'-separated string (it was a list, which later
            # crashed on .split), emotion keys read 'em_angry' (was
            # 'em_anger'), and emotion values are one-element lists (they are
            # indexed with [0] when use_deepmoji is set).
            json_data = {
                'arpabet': 'Failed',
                'durations': [1.0],
                'em_angry': [anger],
                'em_happy': [happy],
                'em_sad': [sad],
                'em_surprise': [surprise],
            }

        arpabet_html = ''
        if voice == 'x_selpahi':
            # Lojban/FastPitch path reports no ARPAbet or emotion values
            em_angry = 0
            em_happy = 0
            em_sad = 0
            em_surprise = 0
        else:
            arpabet_html = self._arpabet_table(json_data)

            if use_deepmoji:
                em_angry = round(json_data['em_angry'][0], 2)
                em_happy = round(json_data['em_happy'][0], 2)
                em_sad = round(json_data['em_sad'][0], 2)
                em_surprise = round(json_data['em_surprise'][0], 2)
            else:
                em_angry = anger
                em_happy = happy
                em_sad = sad
                em_surprise = surprise

        return [
            save_path,
            arpabet_html,
            em_angry,
            em_happy,
            em_sad,
            em_surprise,
            json_data
        ]

    @staticmethod
    def _arpabet_table(json_data):
        """Render ARPAbet symbols and their relative durations as an HTML
        table; each cell's width is its share (%) of the total duration.
        '<PAD>' symbols are skipped."""
        arpabet_symbols = json_data['arpabet'].split('|')
        durations = json_data['durations']

        total_dur_length = 0.0
        for symb_i, symbol in enumerate(arpabet_symbols):
            if symbol != '<PAD>':
                total_dur_length += float(durations[symb_i])
        # guard against all-<PAD>/zero-duration input (would divide by zero)
        if total_dur_length == 0:
            total_dur_length = 1.0

        arpabet_html = '<h6>ARPAbet & Durations</h6>'
        arpabet_html += '<table style="margin: 0 var(--size-2)"><tbody><tr>'
        for symb_i, symbol in enumerate(arpabet_symbols):
            if symbol == '<PAD>':
                continue
            arpabet_length = float(durations[symb_i])
            cell_width = round(arpabet_length / total_dur_length * 100, 2)
            arpabet_html += '<td class="arpabet" style="width: '\
                + str(cell_width)\
                + '%">'\
                + symbol\
                + '</td> '
        # FIX: close the row with '</tr>' (was an opening '<tr>')
        arpabet_html += '</tr></tbody></table>'
        return arpabet_html
if __name__ == "__main__":
print('running custom Gradio interface')
demo = LocalBlocksDemo(models_path, hf_cache_lojban_models_path, hf_cache_robotic_models_path)
demo.block.launch()