import os
import sys
import requests
import json
from huggingface_hub import HfApi
# start xVASynth service (no HTTP)
import resources.app.no_server as xvaserver
from gr_client import BlocksDemo
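
# BlocksDemo (from gr_client) provides the Gradio Blocks UI; LocalBlocksDemo below
# overrides its predict() so synthesis runs through the in-process xVASynth backend
# rather than over HTTP.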
# voice model repositories, resolved to their local Hugging Face cache snapshot paths
hf_model_name = "Pendrokar/xvapitch_nvidia"
model_repo = HfApi()
commits = model_repo.list_repo_commits(repo_id=hf_model_name)
latest_commit_sha = commits[0].commit_id
hf_cache_models_path = f'/home/user/.cache/huggingface/hub/models--Pendrokar--xvapitch_nvidia/snapshots/{latest_commit_sha}/'
commits = model_repo.list_repo_commits(repo_id='Pendrokar/xvasynth_lojban')
latest_commit_sha = commits[0].commit_id
hf_cache_lojban_models_path = f'/home/user/.cache/huggingface/hub/models--Pendrokar--xvasynth_lojban/snapshots/{latest_commit_sha}/'
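
# NOTE (assumption): list_repo_commits only fetches commit metadata; the snapshot
# directories referenced above must already exist in the local HF cache (e.g. be
# downloaded at Space build/startup time), otherwise loading the models will fail.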
models_path = hf_cache_models_path
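
# module-level state: the currently loaded voice model, its architecture type and its
# base speaker embedding, so predict() can avoid reloading an already active voice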
current_voice_model = None
current_voice_type = None
base_speaker_emb = ''
def load_model(voice_model_name):
    global current_voice_model, current_voice_type

    if voice_model_name == 'x_selpahi':
        # Lojban
        model_path = hf_cache_lojban_models_path + voice_model_name
        model_type = 'FastPitch1.1'
    else:
        model_path = models_path + voice_model_name
        model_type = 'xVAPitch'

    language = 'en'  # appears to have no effect when the synthesized text is in another language

    data = {
        'outputs': None,
        'version': '3.0',
        'model': model_path,
        'modelType': model_type,
        'base_lang': language,
        'pluginsContext': '{}',
    }

    embs = base_speaker_emb

    print('Loading voice model...')
    try:
        xvaserver.loadModel(data)
        current_voice_model = voice_model_name
        current_voice_type = model_type

        # read the base speaker embedding from the voice model's metadata JSON
        with open(model_path + '.json', 'r', encoding='utf-8') as f:
            voice_model_json = json.load(f)

        if model_type == 'xVAPitch':
            embs = voice_model_json['games'][0]['base_speaker_emb']
        elif model_type == 'FastPitch1.1':
            embs = voice_model_json['games'][0]['resemblyzer']
    except Exception as err:
        # loadModel() is called in-process (no HTTP), so catch any failure here
        print(f'FAILED to load voice model: {err}')

    return embs
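
# Example: load_model() returns the base speaker embedding of the newly loaded
# voice, which predict() later forwards to synthesize() as 'base_emb', e.g.
#   emb = load_model('x_selpahi')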

class LocalBlocksDemo(BlocksDemo):
    def predict(
        self,
        input_text,
        voice,
        lang,
        pacing,
        pitch,
        energy,
        anger,
        happy,
        sad,
        surprise,
        use_deepmoji
    ):
        global base_speaker_emb

        # keep only the first 1000 characters
        input_text = input_text[:1000]

        # load the voice model only if it is not the currently loaded one
        if current_voice_model != voice:
            base_speaker_emb = load_model(voice)

        model_type = current_voice_type
        pace = pacing if pacing else 1.0
        save_path = '/tmp/xvapitch_audio_sample.wav'
        language = lang
        use_sr = 0
        use_cleanup = 0

        pluginsContext = {}
        pluginsContext["mantella_settings"] = {
            "emAngry": (anger if anger > 0 else 0),
            "emHappy": (happy if happy > 0 else 0),
            "emSad": (sad if sad > 0 else 0),
            "emSurprise": (surprise if surprise > 0 else 0),
            "run_model": use_deepmoji
        }
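        # The emotion sliders are forwarded through the Mantella plugin context;
        # "run_model" presumably toggles DeepMoji-based emotion inference from the
        # text itself (an assumption based on the use_deepmoji parameter name).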
        data = {
            'pluginsContext': json.dumps(pluginsContext),
            'modelType': model_type,
            # pad with whitespace as a workaround to avoid cutoffs
            'sequence': input_text.center(len(input_text) + 2, ' '),
            'pace': pace,
            'outfile': save_path,
            'vocoder': 'n/a',
            'base_lang': language,
            'base_emb': base_speaker_emb,
            'useSR': use_sr,
            'useCleanup': use_cleanup,
        }
        print('Synthesizing...')
        try:
            json_data = xvaserver.synthesize(data)
            # response = requests.post('http://0.0.0.0:8008/synthesize', json=data, timeout=60)
            # response.raise_for_status()  # if the response carries an HTTP error status code, raise an exception
            # json_data = json.loads(response.text)
        except Exception as err:
            # synthesize() is called in-process (no HTTP), so catch any failure here
            print(f'FAILED to synthesize: {err}')
            save_path = ''
            # fallback payload shaped like a successful response so the code below still works
            json_data = {
                'arpabet': 'Failed',
                'durations': [0],
                'em_angry': [anger],
                'em_happy': [happy],
                'em_sad': [sad],
                'em_surprise': [surprise],
            }

        # print('server.log contents:')
        # with open('resources/app/server.log', 'r') as f:
        #     print(f.read())
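        # json_data is expected to hold the '|'-separated ARPAbet sequence, the
        # per-symbol durations and the emotion values that were used; the loop
        # below renders each symbol as an HTML <strong> whose horizontal padding
        # scales with its predicted duration.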
        arpabet_html = '<h6>ARPAbet & Phoneme lengths</h6>'
        arpabet_symbols = json_data['arpabet'].split('|')
        utter_time = 0

        for symb_i in range(len(json_data['durations'])):
            # skip the PAD symbol
            if arpabet_symbols[symb_i] == '<PAD>':
                continue

            length = float(json_data['durations'][symb_i])
            arpa_length = str(round(length / 2, 1))

            arpabet_html += (
                '<strong class="arpabet"'
                f' style="padding: 0 {arpa_length}em"'
                f' title="{utter_time} + {length}"'
                '>'
                + arpabet_symbols[symb_i]
                + '</strong> '
            )

            utter_time += round(length, 1)
        return [
            save_path,
            arpabet_html,
            round(json_data['em_angry'][0], 2),
            round(json_data['em_happy'][0], 2),
            round(json_data['em_sad'][0], 2),
            round(json_data['em_surprise'][0], 2),
            json_data
        ]

if __name__ == "__main__":
    print('running custom Gradio interface')

    demo = LocalBlocksDemo(models_path, hf_cache_lojban_models_path)
    demo.block.launch()
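
# When running outside of HF Spaces, launch() accepts explicit networking options
# (standard Gradio parameters; shown here only as an illustration):
#   demo.block.launch(server_name='0.0.0.0', server_port=7860)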