Spaces:

Pendrokar
/

xVASynth-TTS

Running on CPU Upgrade

App Files Files Community

xVASynth-TTS / app.py

Pendrokar

fastpitch synthesis fix

f70eab2 8 months ago

raw

history blame

4.85 kB

	import os
	import sys
	import requests
	import json
	from huggingface_hub import HfApi

	# start xVASynth service (no HTTP)
	import resources.app.no_server as xvaserver

	from gr_client import BlocksDemo

	# model
	# Voice models are read from the local Hugging Face hub cache. Each path
	# below points at a snapshot directory named after a commit id of the
	# corresponding model repository.
	hf_model_name = "Pendrokar/xvapitch_nvidia"
	model_repo = HfApi()

	# xVAPitch voices: the first commit returned for the repo is used as the
	# "latest" one when building the snapshot path
	commits = model_repo.list_repo_commits(repo_id=hf_model_name)
	latest_commit_sha = commits[0].commit_id
	hf_cache_models_path = (
		'/home/user/.cache/huggingface/hub/'
		'models--Pendrokar--xvapitch_nvidia/snapshots/'
		f'{latest_commit_sha}/'
	)

	# Lojban voice: same lookup against its own repository
	# (`commits` and `latest_commit_sha` are deliberately re-bound here)
	commits = model_repo.list_repo_commits(repo_id='Pendrokar/xvasynth_lojban')
	latest_commit_sha = commits[0].commit_id
	hf_cache_lojban_models_path = (
		'/home/user/.cache/huggingface/hub/'
		'models--Pendrokar--xvasynth_lojban/snapshots/'
		f'{latest_commit_sha}/'
	)

	# default directory used for every voice that is not the Lojban one
	models_path = hf_cache_models_path

	# bookkeeping for the voice that is currently loaded
	current_voice_model = None
	current_voice_type = None
	base_speaker_emb = ''

	def load_model(voice_model_name):
		"""Load a voice model into the xVASynth service and return its embedding.

		The voice ``x_selpahi`` is resolved inside the Lojban snapshot directory
		and loaded as ``FastPitch1.1``; every other voice is resolved inside
		``models_path`` and loaded as ``xVAPitch``.

		On success the module-level ``current_voice_model``,
		``current_voice_type`` and ``base_speaker_emb`` are updated, so callers
		can skip reloading a voice that is already active.

		Args:
			voice_model_name: Name of the voice model; it is appended to the
				snapshot directory path to form the model path.

		Returns:
			The embedding read from the ``<model path>.json`` metadata file, or
			the previously stored ``base_speaker_emb`` when loading failed with
			a request error.
		"""
		# Without this declaration the assignments at the end of the function
		# would only create locals and the module state would never change.
		global current_voice_model, current_voice_type, base_speaker_emb

		if voice_model_name == 'x_selpahi':
			# Lojban
			model_path = hf_cache_lojban_models_path + voice_model_name
			model_type = 'FastPitch1.1'
			emb_key = 'resemblyzer'
		else:
			model_path = models_path + voice_model_name
			model_type = 'xVAPitch'
			emb_key = 'base_speaker_emb'

		language = 'en' # seems to have no effect if generated text is from a different language

		data = {
			'outputs': None,
			'version': '3.0',
			'model': model_path,
			'modelType': model_type,
			'base_lang': language,
			'pluginsContext': '{}',
		}

		# fall back to the embedding of the previously loaded voice
		embs = base_speaker_emb

		print('Loading voice model...')
		try:
			xvaserver.loadModel(data)

			with open(model_path + '.json', 'r', encoding='utf-8') as f:
				voice_model_json = json.load(f)

			embs = voice_model_json['games'][0][emb_key]
		except requests.exceptions.RequestException as err:
			print(f'FAILED to load voice model: {err}')
		else:
			# record the new voice only once model and metadata both loaded
			current_voice_model = voice_model_name
			current_voice_type = model_type
			base_speaker_emb = embs

		return embs


	def _render_arpabet_html(arpabet_symbols, durations):
		"""Return the HTML snippet listing each phoneme with its duration."""
		html_parts = ['<h6>ARPAbet & Phoneme lengths</h6>']
		utter_time = 0
		for symbol, duration in zip(arpabet_symbols, durations):
			# skip PAD symbol (it also does not advance the running time)
			if symbol == '<PAD>':
				continue

			length = float(duration)
			# horizontal padding grows with the phoneme duration
			arpa_length = round(length / 2, 1)
			html_parts.append(
				f'<strong\tclass="arpabet"\tstyle="padding: 0 {arpa_length}em"'
				f' title="{utter_time} + {length}">{symbol}</strong> '
			)
			utter_time += round(length, 1)

		return ''.join(html_parts)


	class LocalBlocksDemo(BlocksDemo):
		"""Gradio demo that synthesizes through the in-process xVASynth service."""

		def predict(
			self,
			input_text,
			voice,
			lang,
			pacing,
			pitch,
			energy,
			anger,
			happy,
			sad,
			surprise,
			use_deepmoji
		):
			"""Synthesize ``input_text`` with the selected voice.

			Args:
				input_text: Text to speak; only the first 1000 characters are used.
				voice: Voice model name; the model is loaded when it differs
					from the currently loaded one.
				lang: Sent to the service as ``base_lang``.
				pacing: Sent as ``pace``; a falsy value becomes ``1.0``.
				pitch: Not used by this method.
				energy: Not used by this method.
				anger, happy, sad, surprise: Emotion values; negative values are
					sent as ``0`` in the ``mantella_settings`` plugin context.
				use_deepmoji: Sent as ``run_model`` in the plugin context.

			Returns:
				A list of: output wav path (empty string on failure), the
				ARPAbet HTML snippet, the four emotion values rounded to two
				decimals, and the raw response dictionary.
			"""
			# grab only the first 1000 characters
			input_text = input_text[:1000]

			# load voice model if not the current model; load_model() keeps the
			# module-level voice name, type and embedding up to date
			if current_voice_model != voice:
				load_model(voice)

			save_path = '/tmp/xvapitch_audio_sample.wav'

			pluginsContext = {
				"mantella_settings": {
					"emAngry": (anger if anger > 0 else 0),
					"emHappy": (happy if happy > 0 else 0),
					"emSad": (sad if sad > 0 else 0),
					"emSurprise": (surprise if surprise > 0 else 0),
					"run_model": use_deepmoji
				}
			}

			data = {
				'pluginsContext': json.dumps(pluginsContext),
				'modelType': current_voice_type,
				# pad with whitespaces as a workaround to avoid cutoffs
				'sequence': input_text.center(len(input_text) + 2, ' '),
				'pace': pacing if pacing else 1.0,
				'outfile': save_path,
				'vocoder': 'n/a',
				'base_lang': lang,
				'base_emb': base_speaker_emb,
				'useSR': 0,
				'useCleanup': 0,
			}

			print('Synthesizing...')
			try:
				# in-process call, no HTTP round trip
				# NOTE(review): confirm whether this call can raise
				# RequestException at all now that it is not an HTTP request
				json_data = xvaserver.synthesize(data)
			except requests.exceptions.RequestException as err:
				print(f'FAILED to synthesize: {err}')
				save_path = ''
				# same shape as the fields read below from a real response
				json_data = {
					'arpabet': 'Failed',
					'durations': [0],
					'em_angry': [anger],
					'em_happy': [happy],
					'em_sad': [sad],
					'em_surprise': [surprise],
				}

			# assumes the service separates ARPAbet symbols with '|' — TODO confirm
			arpabet_html = _render_arpabet_html(
				json_data['arpabet'].split('|'),
				json_data['durations'],
			)

			return [
				save_path,
				arpabet_html,
				round(json_data['em_angry'][0], 2),
				round(json_data['em_happy'][0], 2),
				round(json_data['em_sad'][0], 2),
				round(json_data['em_surprise'][0], 2),
				json_data
			]

	if __name__ == "__main__":
		# Script entry point: build the demo from the two model snapshot
		# directories and launch its Gradio block.
		print('running custom Gradio interface')
		demo = LocalBlocksDemo(models_path, hf_cache_lojban_models_path)
		demo.block.launch()